granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <math.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <unistd.h>
  24
  25 #include "access/clog.h"
  26 #include "access/commit_ts.h"
  27 #include "access/multixact.h"
  28 #include "access/rewriteheap.h"
  29 #include "access/subtrans.h"
  30 #include "access/timeline.h"
  31 #include "access/transam.h"
  32 #include "access/tuptoaster.h"
  33 #include "access/twophase.h"
  34 #include "access/xact.h"
  35 #include "access/xlog_internal.h"
  36 #include "access/xloginsert.h"
  37 #include "access/xlogreader.h"
  38 #include "access/xlogutils.h"
  39 #include "catalog/catversion.h"
  40 #include "catalog/pg_control.h"
  41 #include "catalog/pg_database.h"
  42 #include "commands/tablespace.h"
  43 #include "miscadmin.h"
  44 #include "pgstat.h"
  45 #include "port/atomics.h"
  46 #include "postmaster/bgwriter.h"
  47 #include "postmaster/walwriter.h"
  48 #include "postmaster/startup.h"
  49 #include "replication/basebackup.h"
  50 #include "replication/logical.h"
  51 #include "replication/slot.h"
  52 #include "replication/origin.h"
  53 #include "replication/snapbuild.h"
  54 #include "replication/walreceiver.h"
  55 #include "replication/walsender.h"
  56 #include "storage/bufmgr.h"
  57 #include "storage/fd.h"
  58 #include "storage/ipc.h"
  59 #include "storage/large_object.h"
  60 #include "storage/latch.h"
  61 #include "storage/pmsignal.h"
  62 #include "storage/predicate.h"
  63 #include "storage/proc.h"
  64 #include "storage/procarray.h"
  65 #include "storage/reinit.h"
  66 #include "storage/smgr.h"
  67 #include "storage/spin.h"
  68 #include "utils/backend_random.h"
  69 #include "utils/builtins.h"
  70 #include "utils/guc.h"
  71 #include "utils/memutils.h"
  72 #include "utils/pg_lsn.h"
  73 #include "utils/ps_status.h"
  74 #include "utils/relmapper.h"
  75 #include "utils/snapmgr.h"
  76 #include "utils/timestamp.h"
  77 #include "pg_trace.h"
  78
  79 extern uint32 bootstrap_data_checksum_version;
  80
  81 /* File path names (all relative to $PGDATA) */
  82 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  83 #define RECOVERY_COMMAND_DONE   "recovery.done"
  84 #define PROMOTE_SIGNAL_FILE             "promote"
  85 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  86
  87
  88 /* User-settable parameters */
  89 int                     max_wal_size = 64;      /* 1 GB */
  90 int                     min_wal_size = 5;       /* 80 MB */
  91 int                     wal_keep_segments = 0;
  92 int                     XLOGbuffers = -1;
  93 int                     XLogArchiveTimeout = 0;
  94 int                     XLogArchiveMode = ARCHIVE_MODE_OFF;
  95 char       *XLogArchiveCommand = NULL;
  96 bool            EnableHotStandby = false;
  97 bool            fullPageWrites = true;
  98 bool            wal_log_hints = false;
  99 bool            wal_compression = false;
 100 char       *wal_consistency_checking_string = NULL;
 101 bool       *wal_consistency_checking = NULL;
 102 bool            log_checkpoints = false;
 103 int                     sync_method = DEFAULT_SYNC_METHOD;
 104 int                     wal_level = WAL_LEVEL_MINIMAL;
 105 int                     CommitDelay = 0;        /* precommit delay in microseconds */
 106 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 107 int                     wal_retrieve_retry_interval = 5000;
 108
 109 #ifdef WAL_DEBUG
 110 bool            XLOG_DEBUG = false;
 111 #endif
 112
 113 /*
 114  * Number of WAL insertion locks to use. A higher value allows more insertions
 115  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 116  * which needs to iterate all the locks.
 117  */
 118 #define NUM_XLOGINSERT_LOCKS  8
 119
 120 /*
 121  * Max distance from last checkpoint, before triggering a new xlog-based
 122  * checkpoint.
 123  */
 124 int                     CheckPointSegments;
 125
 126 /* Estimated distance between checkpoints, in bytes */
 127 static double CheckPointDistanceEstimate = 0;
 128 static double PrevCheckPointDistance = 0;
 129
 130 /*
 131  * GUC support
 132  */
 133 const struct config_enum_entry sync_method_options[] = {
 134         {"fsync", SYNC_METHOD_FSYNC, false},
 135 #ifdef HAVE_FSYNC_WRITETHROUGH
 136         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 137 #endif
 138 #ifdef HAVE_FDATASYNC
 139         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 140 #endif
 141 #ifdef OPEN_SYNC_FLAG
 142         {"open_sync", SYNC_METHOD_OPEN, false},
 143 #endif
 144 #ifdef OPEN_DATASYNC_FLAG
 145         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 146 #endif
 147         {NULL, 0, false}
 148 };
 149
 150
 151 /*
 152  * Although only "on", "off", and "always" are documented,
 153  * we accept all the likely variants of "on" and "off".
 154  */
 155 const struct config_enum_entry archive_mode_options[] = {
 156         {"always", ARCHIVE_MODE_ALWAYS, false},
 157         {"on", ARCHIVE_MODE_ON, false},
 158         {"off", ARCHIVE_MODE_OFF, false},
 159         {"true", ARCHIVE_MODE_ON, true},
 160         {"false", ARCHIVE_MODE_OFF, true},
 161         {"yes", ARCHIVE_MODE_ON, true},
 162         {"no", ARCHIVE_MODE_OFF, true},
 163         {"1", ARCHIVE_MODE_ON, true},
 164         {"0", ARCHIVE_MODE_OFF, true},
 165         {NULL, 0, false}
 166 };
 167
 168 /*
 169  * Statistics for current checkpoint are collected in this global struct.
 170  * Because only the checkpointer or a stand-alone backend can perform
 171  * checkpoints, this will be unused in normal backends.
 172  */
 173 CheckpointStatsData CheckpointStats;
 174
 175 /*
 176  * ThisTimeLineID will be same in all backends --- it identifies current
 177  * WAL timeline for the database system.
 178  */
 179 TimeLineID      ThisTimeLineID = 0;
 180
 181 /*
 182  * Are we doing recovery from XLOG?
 183  *
 184  * This is only ever true in the startup process; it should be read as meaning
 185  * "this process is replaying WAL records", rather than "the system is in
 186  * recovery mode".  It should be examined primarily by functions that need
 187  * to act differently when called from a WAL redo function (e.g., to skip WAL
 188  * logging).  To check whether the system is in recovery regardless of which
 189  * process you're running in, use RecoveryInProgress() but only after shared
 190  * memory startup and lock initialization.
 191  */
 192 bool            InRecovery = false;
 193
 194 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 195 HotStandbyState standbyState = STANDBY_DISABLED;
 196
 197 static XLogRecPtr LastRec;
 198
 199 /* Local copy of WalRcv->receivedUpto */
 200 static XLogRecPtr receivedUpto = 0;
 201 static TimeLineID receiveTLI = 0;
 202
 203 /*
 204  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 205  * the replayed WAL records indicate. It's initialized with full_page_writes
 206  * that the recovery starting checkpoint record indicates, and then updated
 207  * each time XLOG_FPW_CHANGE record is replayed.
 208  */
 209 static bool lastFullPageWrites;
 210
 211 /*
 212  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 213  * known, need to check the shared state".
 214  */
 215 static bool LocalRecoveryInProgress = true;
 216
 217 /*
 218  * Local copy of SharedHotStandbyActive variable. False actually means "not
 219  * known, need to check the shared state".
 220  */
 221 static bool LocalHotStandbyActive = false;
 222
 223 /*
 224  * Local state for XLogInsertAllowed():
 225  *              1: unconditionally allowed to insert XLOG
 226  *              0: unconditionally not allowed to insert XLOG
 227  *              -1: must check RecoveryInProgress(); disallow until it is false
 228  * Most processes start with -1 and transition to 1 after seeing that recovery
 229  * is not in progress.  But we can also force the value for special cases.
 230  * The coding in XLogInsertAllowed() depends on the first two of these states
 231  * being numerically the same as bool true and false.
 232  */
 233 static int      LocalXLogInsertAllowed = -1;
 234
 235 /*
 236  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 237  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 238  * currently recovering using offline XLOG archives. These variables are only
 239  * valid in the startup process.
 240  *
 241  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 242  * currently performing crash recovery using only XLOG files in pg_wal, but
 243  * will switch to using offline XLOG archives as soon as we reach the end of
 244  * WAL in pg_wal.
 245 */
 246 bool            ArchiveRecoveryRequested = false;
 247 bool            InArchiveRecovery = false;
 248
 249 /* Was the last xlog file restored from archive, or local? */
 250 static bool restoredFromArchive = false;
 251
 252 /* Buffers dedicated to consistency checks of size BLCKSZ */
 253 static char *replay_image_masked = NULL;
 254 static char *master_image_masked = NULL;
 255
 256 /* options taken from recovery.conf for archive recovery */
 257 char       *recoveryRestoreCommand = NULL;
 258 static char *recoveryEndCommand = NULL;
 259 static char *archiveCleanupCommand = NULL;
 260 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 261 static bool recoveryTargetInclusive = true;
 262 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
 263 static TransactionId recoveryTargetXid;
 264 static TimestampTz recoveryTargetTime;
 265 static char *recoveryTargetName;
 266 static XLogRecPtr recoveryTargetLSN;
 267 static int      recovery_min_apply_delay = 0;
 268 static TimestampTz recoveryDelayUntilTime;
 269
 270 /* options taken from recovery.conf for XLOG streaming */
 271 static bool StandbyModeRequested = false;
 272 static char *PrimaryConnInfo = NULL;
 273 static char *PrimarySlotName = NULL;
 274 static char *TriggerFile = NULL;
 275
 276 /* are we currently in standby mode? */
 277 bool            StandbyMode = false;
 278
 279 /* whether request for fast promotion has been made yet */
 280 static bool fast_promote = false;
 281
 282 /*
 283  * if recoveryStopsBefore/After returns true, it saves information of the stop
 284  * point here
 285  */
 286 static TransactionId recoveryStopXid;
 287 static TimestampTz recoveryStopTime;
 288 static XLogRecPtr recoveryStopLSN;
 289 static char recoveryStopName[MAXFNAMELEN];
 290 static bool recoveryStopAfter;
 291
 292 /*
 293  * During normal operation, the only timeline we care about is ThisTimeLineID.
 294  * During recovery, however, things are more complicated.  To simplify life
 295  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 296  * scan through the WAL history (that is, it is the line that was active when
 297  * the currently-scanned WAL record was generated).  We also need these
 298  * timeline values:
 299  *
 300  * recoveryTargetTLI: the desired timeline that we want to end in.
 301  *
 302  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 303  *
 304  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 305  * its known parents, newest first (so recoveryTargetTLI is always the
 306  * first list member).  Only these TLIs are expected to be seen in the WAL
 307  * segments we read, and indeed only these TLIs will be considered as
 308  * candidate WAL files to open at all.
 309  *
 310  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 311  * (This is not necessarily the same as ThisTimeLineID, because we could
 312  * be scanning data that was copied from an ancestor timeline when the current
 313  * file was created.)  During a sequential scan we do not allow this value
 314  * to decrease.
 315  */
 316 static TimeLineID recoveryTargetTLI;
 317 static bool recoveryTargetIsLatest = false;
 318 static List *expectedTLEs;
 319 static TimeLineID curFileTLI;
 320
 321 /*
 322  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 323  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 324  * end+1 of the last record, and is reset when we end a top-level transaction,
 325  * or start a new one; so it can be used to tell if the current transaction has
 326  * created any XLOG records.
 327  *
 328  * While in parallel mode, this may not be fully up to date.  When committing,
 329  * a transaction can assume this covers all xlog records written either by the
 330  * user backend or by any parallel worker which was present at any point during
 331  * the transaction.  But when aborting, or when still in parallel mode, other
 332  * parallel backends may have written WAL records at later LSNs than the value
 333  * stored here.  The parallel leader advances its own copy, when necessary,
 334  * in WaitForParallelWorkersToFinish.
 335  */
 336 XLogRecPtr      ProcLastRecPtr = InvalidXLogRecPtr;
 337 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 338 XLogRecPtr      XactLastCommitEnd = InvalidXLogRecPtr;
 339
 340 /*
 341  * RedoRecPtr is this backend's local copy of the REDO record pointer
 342  * (which is almost but not quite the same as a pointer to the most recent
 343  * CHECKPOINT record).  We update this from the shared-memory copy,
 344  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 345  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 346  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 347  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 348  * InitXLOGAccess.
 349  */
 350 static XLogRecPtr RedoRecPtr;
 351
 352 /*
 353  * doPageWrites is this backend's local copy of (forcePageWrites ||
 354  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
 355  * a full-page image of a page need to be taken.
 356  */
 357 static bool doPageWrites;
 358
 359 /* Has the recovery code requested a walreceiver wakeup? */
 360 static bool doRequestWalReceiverReply;
 361
 362 /*
 363  * RedoStartLSN points to the checkpoint's REDO location which is specified
 364  * in a backup label file, backup history file or control file. In standby
 365  * mode, XLOG streaming usually starts from the position where an invalid
 366  * record was found. But if we fail to read even the initial checkpoint
 367  * record, we use the REDO location instead of the checkpoint location as
 368  * the start position of XLOG streaming. Otherwise we would have to jump
 369  * backwards to the REDO location after reading the checkpoint record,
 370  * because the REDO record can precede the checkpoint record.
 371  */
 372 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 373
 374 /*----------
 375  * Shared-memory data structures for XLOG control
 376  *
 377  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 378  * the log up to (all records before that point must be written or fsynced).
 379  * LogwrtResult indicates the byte positions we have already written/fsynced.
 380  * These structs are identical but are declared separately to indicate their
 381  * slightly different functions.
 382  *
 383  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 384  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 385  * this arrangement is that the value can be examined by code that already
 386  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 387  * to the shared variable, each backend has a private copy of LogwrtResult,
 388  * which is updated when convenient.
 389  *
 390  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 391  * (protected by info_lck), but we don't need to cache any copies of it.
 392  *
 393  * info_lck is only held long enough to read/update the protected variables,
 394  * so it's a plain spinlock.  The other locks are held longer (potentially
 395  * over I/O operations), so we use LWLocks for them.  These locks are:
 396  *
 397  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 398  * It is only held while initializing and changing the mapping.  If the
 399  * contents of the buffer being replaced haven't been written yet, the mapping
 400  * lock is released while the write is done, and reacquired afterwards.
 401  *
 402  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 403  * XLogFlush).
 404  *
 405  * ControlFileLock: must be held to read/update control file or create
 406  * new log file.
 407  *
 408  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 409  * only one checkpointer at a time; currently, with all checkpoints done by
 410  * the checkpointer, this is just pro forma).
 411  *
 412  *----------
 413  */
 414
 415 typedef struct XLogwrtRqst
 416 {
 417         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 418         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 419 } XLogwrtRqst;
 420
 421 typedef struct XLogwrtResult
 422 {
 423         XLogRecPtr      Write;                  /* last byte + 1 written out */
 424         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 425 } XLogwrtResult;
 426
 427 /*
 428  * Inserting to WAL is protected by a small fixed number of WAL insertion
 429  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 430  * matter which one. To lock out other concurrent insertions, you must hold
 431  * of them. Each WAL insertion lock consists of a lightweight lock, plus an
 432  * indicator of how far the insertion has progressed (insertingAt).
 433  *
 434  * The insertingAt values are read when a process wants to flush WAL from
 435  * the in-memory buffers to disk, to check that all the insertions to the
 436  * region the process is about to write out have finished. You could simply
 437  * wait for all currently in-progress insertions to finish, but the
 438  * insertingAt indicator allows you to ignore insertions to later in the WAL,
 439  * so that you only wait for the insertions that are modifying the buffers
 440  * you're about to write out.
 441  *
 442  * This isn't just an optimization. If all the WAL buffers are dirty, an
 443  * inserter that's holding a WAL insert lock might need to evict an old WAL
 444  * buffer, which requires flushing the WAL. If it's possible for an inserter
 445  * to block on another inserter unnecessarily, deadlock can arise when two
 446  * inserters holding a WAL insert lock wait for each other to finish their
 447  * insertion.
 448  *
 449  * Small WAL records that don't cross a page boundary never update the value,
 450  * the WAL record is just copied to the page and the lock is released. But
 451  * to avoid the deadlock-scenario explained above, the indicator is always
 452  * updated before sleeping while holding an insertion lock.
 453  *
 454  * lastImportantAt contains the LSN of the last important WAL record inserted
 455  * using a given lock. This value is used to detect if there has been
 456  * important WAL activity since the last time some action, like a checkpoint,
 457  * was performed - allowing to not repeat the action if not. The LSN is
 458  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
 459  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
 460  * records.  Tracking the WAL activity directly in WALInsertLock has the
 461  * advantage of not needing any additional locks to update the value.
 462  */
 463 typedef struct
 464 {
 465         LWLock          lock;
 466         XLogRecPtr      insertingAt;
 467         XLogRecPtr      lastImportantAt;
 468 } WALInsertLock;
 469
 470 /*
 471  * All the WAL insertion locks are allocated as an array in shared memory. We
 472  * force the array stride to be a power of 2, which saves a few cycles in
 473  * indexing, but more importantly also ensures that individual slots don't
 474  * cross cache line boundaries. (Of course, we have to also ensure that the
 475  * array start address is suitably aligned.)
 476  */
 477 typedef union WALInsertLockPadded
 478 {
 479         WALInsertLock l;
 480         char            pad[PG_CACHE_LINE_SIZE];
 481 } WALInsertLockPadded;
 482
 483 /*
 484  * State of an exclusive backup, necessary to control concurrent activities
 485  * across sessions when working on exclusive backups.
 486  *
 487  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 488  * running, to be more precise pg_start_backup() is not being executed for
 489  * an exclusive backup and there is no exclusive backup in progress.
 490  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 491  * exclusive backup.
 492  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 493  * running and an exclusive backup is in progress. pg_stop_backup() is
 494  * needed to finish it.
 495  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 496  * exclusive backup.
 497  */
 498 typedef enum ExclusiveBackupState
 499 {
 500         EXCLUSIVE_BACKUP_NONE = 0,
 501         EXCLUSIVE_BACKUP_STARTING,
 502         EXCLUSIVE_BACKUP_IN_PROGRESS,
 503         EXCLUSIVE_BACKUP_STOPPING
 504 } ExclusiveBackupState;
 505
 506 /*
 507  * Shared state data for WAL insertion.
 508  */
 509 typedef struct XLogCtlInsert
 510 {
 511         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 512
 513         /*
 514          * CurrBytePos is the end of reserved WAL. The next record will be
 515          * inserted at that position. PrevBytePos is the start position of the
 516          * previously inserted (or rather, reserved) record - it is copied to the
 517          * prev-link of the next record. These are stored as "usable byte
 518          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
 519          */
 520         uint64          CurrBytePos;
 521         uint64          PrevBytePos;
 522
 523         /*
 524          * Make sure the above heavily-contended spinlock and byte positions are
 525          * on their own cache line. In particular, the RedoRecPtr and full page
 526          * write variables below should be on a different cache line. They are
 527          * read on every WAL insertion, but updated rarely, and we don't want
 528          * those reads to steal the cache line containing Curr/PrevBytePos.
 529          */
 530         char            pad[PG_CACHE_LINE_SIZE];
 531
 532         /*
 533          * fullPageWrites is the master copy used by all backends to determine
 534          * whether to write full-page to WAL, instead of using process-local one.
 535          * This is required because, when full_page_writes is changed by SIGHUP,
 536          * we must WAL-log it before it actually affects WAL-logging by backends.
 537          * Checkpointer sets at startup or after SIGHUP.
 538          *
 539          * To read these fields, you must hold an insertion lock. To modify them,
 540          * you must hold ALL the locks.
 541          */
 542         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 543         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 544         bool            fullPageWrites;
 545
 546         /*
 547          * exclusiveBackupState indicates the state of an exclusive backup
 548          * (see comments of ExclusiveBackupState for more details).
 549          * nonExclusiveBackups is a counter indicating the number of streaming
 550          * base backups currently in progress. forcePageWrites is set to true
 551          * when either of these is non-zero. lastBackupStart is the latest
 552          * checkpoint redo location used as a starting point for an online
 553          * backup.
 554          */
 555         ExclusiveBackupState exclusiveBackupState;
 556         int                     nonExclusiveBackups;
 557         XLogRecPtr      lastBackupStart;
 558
 559         /*
 560          * WAL insertion locks.
 561          */
 562         WALInsertLockPadded *WALInsertLocks;
 563 } XLogCtlInsert;
 564
 565 /*
 566  * Total shared-memory state for XLOG.
 567  */
 568 typedef struct XLogCtlData
 569 {
 570         XLogCtlInsert Insert;
 571
 572         /* Protected by info_lck: */
 573         XLogwrtRqst LogwrtRqst;
 574         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
 575         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
 576         TransactionId ckptXid;
 577         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
 578         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
 579
 580         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
 581                                                                                  * segment */
 582
 583         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
 584         XLogRecPtr      unloggedLSN;
 585         slock_t         ulsn_lck;
 586
 587         /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
 588         pg_time_t       lastSegSwitchTime;
 589         XLogRecPtr      lastSegSwitchLSN;
 590
 591         /*
 592          * Protected by info_lck and WALWriteLock (you must hold either lock to
 593          * read it, but both to update)
 594          */
 595         XLogwrtResult LogwrtResult;
 596
 597         /*
 598          * Latest initialized page in the cache (last byte position + 1).
 599          *
 600          * To change the identity of a buffer (and InitializedUpTo), you need to
 601          * hold WALBufMappingLock.  To change the identity of a buffer that's
 602          * still dirty, the old page needs to be written out first, and for that
 603          * you need WALWriteLock, and you need to ensure that there are no
 604          * in-progress insertions to the page by calling
 605          * WaitXLogInsertionsToFinish().
 606          */
 607         XLogRecPtr      InitializedUpTo;
 608
 609         /*
 610          * These values do not change after startup, although the pointed-to pages
 611          * and xlblocks values certainly do.  xlblock values are protected by
 612          * WALBufMappingLock.
 613          */
 614         char       *pages;                      /* buffers for unwritten XLOG pages */
 615         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
 616         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 617
 618         /*
 619          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
 620          * If we created a new timeline when the system was started up,
 621          * PrevTimeLineID is the old timeline's ID that we forked off from.
 622          * Otherwise it's equal to ThisTimeLineID.
 623          */
 624         TimeLineID      ThisTimeLineID;
 625         TimeLineID      PrevTimeLineID;
 626
 627         /*
 628          * archiveCleanupCommand is read from recovery.conf but needs to be in
 629          * shared memory so that the checkpointer process can access it.
 630          */
 631         char            archiveCleanupCommand[MAXPGPATH];
 632
 633         /*
 634          * SharedRecoveryInProgress indicates if we're still in crash or archive
 635          * recovery.  Protected by info_lck.
 636          */
 637         bool            SharedRecoveryInProgress;
 638
 639         /*
 640          * SharedHotStandbyActive indicates if we're still in crash or archive
 641          * recovery.  Protected by info_lck.
 642          */
 643         bool            SharedHotStandbyActive;
 644
 645         /*
 646          * WalWriterSleeping indicates whether the WAL writer is currently in
 647          * low-power mode (and hence should be nudged if an async commit occurs).
 648          * Protected by info_lck.
 649          */
 650         bool            WalWriterSleeping;
 651
 652         /*
 653          * recoveryWakeupLatch is used to wake up the startup process to continue
 654          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 655          * to appear.
 656          */
 657         Latch           recoveryWakeupLatch;
 658
 659         /*
 660          * During recovery, we keep a copy of the latest checkpoint record here.
 661          * lastCheckPointRecPtr points to start of checkpoint record and
 662          * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
 663          * checkpointer when it wants to create a restartpoint.
 664          *
 665          * Protected by info_lck.
 666          */
 667         XLogRecPtr      lastCheckPointRecPtr;
 668         XLogRecPtr      lastCheckPointEndPtr;
 669         CheckPoint      lastCheckPoint;
 670
 671         /*
 672          * lastReplayedEndRecPtr points to end+1 of the last record successfully
 673          * replayed. When we're currently replaying a record, ie. in a redo
 674          * function, replayEndRecPtr points to the end+1 of the record being
 675          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
 676          */
 677         XLogRecPtr      lastReplayedEndRecPtr;
 678         TimeLineID      lastReplayedTLI;
 679         XLogRecPtr      replayEndRecPtr;
 680         TimeLineID      replayEndTLI;
 681         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 682         TimestampTz recoveryLastXTime;
 683
 684         /*
 685          * timestamp of when we started replaying the current chunk of WAL data,
 686          * only relevant for replication or archive recovery
 687          */
 688         TimestampTz currentChunkStartTime;
 689         /* Are we requested to pause recovery? */
 690         bool            recoveryPause;
 691
 692         /*
 693          * lastFpwDisableRecPtr points to the start of the last replayed
 694          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
 695          */
 696         XLogRecPtr      lastFpwDisableRecPtr;
 697
 698         slock_t         info_lck;               /* locks shared variables shown above */
 699 } XLogCtlData;
 700
 701 static XLogCtlData *XLogCtl = NULL;
 702
 703 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
 704 static WALInsertLockPadded *WALInsertLocks = NULL;
 705
 706 /*
 707  * We maintain an image of pg_control in shared memory.
 708  */
 709 static ControlFileData *ControlFile = NULL;
 710
 711 /*
 712  * Calculate the amount of space left on the page after 'endptr'. Beware
 713  * multiple evaluation!
 714  */
 715 #define INSERT_FREESPACE(endptr)        \
 716         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 717
 718 /* Macro to advance to next buffer index. */
 719 #define NextBufIdx(idx)         \
 720                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 721
 722 /*
 723  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 724  * would hold if it was in cache, the page containing 'recptr'.
 725  */
 726 #define XLogRecPtrToBufIdx(recptr)      \
 727         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 728
 729 /*
 730  * These are the number of bytes in a WAL page and segment usable for WAL data.
 731  */
 732 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 733 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 734
 735 /*
 736  * Private, possibly out-of-date copy of shared LogwrtResult.
 737  * See discussion above.
 738  */
 739 static XLogwrtResult LogwrtResult = {0, 0};
 740
 741 /*
 742  * Codes indicating where we got a WAL file from during recovery, or where
 743  * to attempt to get one.
 744  */
 745 typedef enum
 746 {
 747         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 748         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 749         XLOG_FROM_PG_WAL,                       /* existing file in pg_wal */
 750         XLOG_FROM_STREAM                        /* streamed from master */
 751 } XLogSource;
 752
 753 /* human-readable names for XLogSources, for debugging output */
 754 static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
 755
 756 /*
 757  * openLogFile is -1 or a kernel FD for an open log file segment.
 758  * When it's open, openLogOff is the current seek offset in the file.
 759  * openLogSegNo identifies the segment.  These variables are only
 760  * used to write the XLOG, and so will normally refer to the active segment.
 761  */
 762 static int      openLogFile = -1;
 763 static XLogSegNo openLogSegNo = 0;
 764 static uint32 openLogOff = 0;
 765
 766 /*
 767  * These variables are used similarly to the ones above, but for reading
 768  * the XLOG.  Note, however, that readOff generally represents the offset
 769  * of the page just read, not the seek position of the FD itself, which
 770  * will be just past that page. readLen indicates how much of the current
 771  * page has been read into readBuf, and readSource indicates where we got
 772  * the currently open file from.
 773  */
 774 static int      readFile = -1;
 775 static XLogSegNo readSegNo = 0;
 776 static uint32 readOff = 0;
 777 static uint32 readLen = 0;
 778 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
 779
 780 /*
 781  * Keeps track of which source we're currently reading from. This is
 782  * different from readSource in that this is always set, even when we don't
 783  * currently have a WAL file open. If lastSourceFailed is set, our last
 784  * attempt to read from currentSource failed, and we should try another source
 785  * next.
 786  */
 787 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 788 static bool lastSourceFailed = false;
 789
 790 typedef struct XLogPageReadPrivate
 791 {
 792         int                     emode;
 793         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
 794         bool            randAccess;
 795 } XLogPageReadPrivate;
 796
 797 /*
 798  * These variables track when we last obtained some WAL data to process,
 799  * and where we got it from.  (XLogReceiptSource is initially the same as
 800  * readSource, but readSource gets reset to zero when we don't have data
 801  * to process right now.  It is also different from currentSource, which
 802  * also changes when we try to read from a source and fail, while
 803  * XLogReceiptSource tracks where we last successfully read some WAL.)
 804  */
 805 static TimestampTz XLogReceiptTime = 0;
 806 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
 807
 808 /* State information for XLOG reading */
 809 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 810 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 811
 812 static XLogRecPtr minRecoveryPoint;             /* local copy of
 813                                                                                  * ControlFile->minRecoveryPoint */
 814 static TimeLineID minRecoveryPointTLI;
 815 static bool updateMinRecoveryPoint = true;
 816
 817 /*
 818  * Have we reached a consistent database state? In crash recovery, we have
 819  * to replay all the WAL, so reachedConsistency is never set. During archive
 820  * recovery, the database is consistent once minRecoveryPoint is reached.
 821  */
 822 bool            reachedConsistency = false;
 823
 824 static bool InRedo = false;
 825
 826 /* Have we launched bgwriter during recovery? */
 827 static bool bgwriterLaunched = false;
 828
 829 /* For WALInsertLockAcquire/Release functions */
 830 static int      MyLockNo = 0;
 831 static bool holdingAllLocks = false;
 832
 833 #ifdef WAL_DEBUG
 834 static MemoryContext walDebugCxt = NULL;
 835 #endif
 836
 837 static void readRecoveryCommandFile(void);
 838 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
 839 static bool recoveryStopsBefore(XLogReaderState *record);
 840 static bool recoveryStopsAfter(XLogReaderState *record);
 841 static void recoveryPausesHere(void);
 842 static bool recoveryApplyDelay(XLogReaderState *record);
 843 static void SetLatestXTime(TimestampTz xtime);
 844 static void SetCurrentChunkStartTime(TimestampTz xtime);
 845 static void CheckRequiredParameterValues(void);
 846 static void XLogReportParameters(void);
 847 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 848                                         TimeLineID prevTLI);
 849 static void LocalSetXLogInsertAllowed(void);
 850 static void CreateEndOfRecoveryRecord(void);
 851 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 852 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 853 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 854
 855 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 856 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 857 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 858 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 859                                            bool find_free, XLogSegNo max_segno,
 860                                            bool use_lock);
 861 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 862                          int source, bool notfoundOk);
 863 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 864 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 865                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 866                          TimeLineID *readTLI);
 867 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 868                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 869 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 870 static void XLogFileClose(void);
 871 static void PreallocXlogFiles(XLogRecPtr endptr);
 872 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
 873 static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
 874 static void UpdateLastRemovedPtr(char *filename);
 875 static void ValidateXLOGDirectoryStructure(void);
 876 static void CleanupBackupHistory(void);
 877 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 878 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 879                    int emode, bool fetching_ckpt);
 880 static void CheckRecoveryConsistency(void);
 881 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 882                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 883 static bool rescanLatestTimeLine(void);
 884 static void WriteControlFile(void);
 885 static void ReadControlFile(void);
 886 static char *str_time(pg_time_t tnow);
 887 static bool CheckForStandbyTrigger(void);
 888
 889 #ifdef WAL_DEBUG
 890 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
 891 #endif
 892 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
 893 static void pg_start_backup_callback(int code, Datum arg);
 894 static void pg_stop_backup_callback(int code, Datum arg);
 895 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 896                                   bool *backupEndRequired, bool *backupFromStandby);
 897 static bool read_tablespace_map(List **tablespaces);
 898
 899 static void rm_redo_error_callback(void *arg);
 900 static int      get_sync_bit(int method);
 901
 902 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 903                                         XLogRecData *rdata,
 904                                         XLogRecPtr StartPos, XLogRecPtr EndPos);
 905 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 906                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 907 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 908                                   XLogRecPtr *PrevPtr);
 909 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 910 static char *GetXLogBuffer(XLogRecPtr ptr);
 911 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 912 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 913 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 914 static void checkXLogConsistency(XLogReaderState *record);
 915
 916 static void WALInsertLockAcquire(void);
 917 static void WALInsertLockAcquireExclusive(void);
 918 static void WALInsertLockRelease(void);
 919 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 920
 921 /*
 922  * Insert an XLOG record represented by an already-constructed chain of data
 923  * chunks.  This is a low-level routine; to construct the WAL record header
 924  * and data, use the higher-level routines in xloginsert.c.
 925  *
 926  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 927  * WAL record applies to, that were not included in the record as full page
 928  * images.  If fpw_lsn >= RedoRecPtr, the function does not perform the
 929  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 930  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 931  * record is always inserted.
 932  *
 933  * 'flags' gives more in-depth control on the record being inserted. See
 934  * XLogSetRecordFlags() for details.
 935  *
 936  * The first XLogRecData in the chain must be for the record header, and its
 937  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 938  * xl_crc fields in the header, the rest of the header must already be filled
 939  * by the caller.
 940  *
 941  * Returns XLOG pointer to end of record (beginning of next record).
 942  * This can be used as LSN for data pages affected by the logged action.
 943  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 944  * before the data page can be written out.  This implements the basic
 945  * WAL rule "write the log before the data".)
 946  */
 947 XLogRecPtr
 948 XLogInsertRecord(XLogRecData *rdata,
 949                                  XLogRecPtr fpw_lsn,
 950                                  uint8 flags)
 951 {
 952         XLogCtlInsert *Insert = &XLogCtl->Insert;
 953         pg_crc32c       rdata_crc;
 954         bool            inserted;
 955         XLogRecord *rechdr = (XLogRecord *) rdata->data;
 956         uint8           info = rechdr->xl_info & ~XLR_INFO_MASK;
 957         bool            isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
 958                                                            info == XLOG_SWITCH);
 959         XLogRecPtr      StartPos;
 960         XLogRecPtr      EndPos;
 961
 962         /* we assume that all of the record header is in the first chunk */
 963         Assert(rdata->len >= SizeOfXLogRecord);
 964
 965         /* cross-check on whether we should be here or not */
 966         if (!XLogInsertAllowed())
 967                 elog(ERROR, "cannot make new WAL entries during recovery");
 968
 969         /*----------
 970          *
 971          * We have now done all the preparatory work we can without holding a
 972          * lock or modifying shared state. From here on, inserting the new WAL
 973          * record to the shared WAL buffer cache is a two-step process:
 974          *
 975          * 1. Reserve the right amount of space from the WAL. The current head of
 976          *        reserved space is kept in Insert->CurrBytePos, and is protected by
 977          *        insertpos_lck.
 978          *
 979          * 2. Copy the record to the reserved WAL space. This involves finding the
 980          *        correct WAL buffer containing the reserved space, and copying the
 981          *        record in place. This can be done concurrently in multiple processes.
 982          *
 983          * To keep track of which insertions are still in-progress, each concurrent
 984          * inserter acquires an insertion lock. In addition to just indicating that
 985          * an insertion is in progress, the lock tells others how far the inserter
 986          * has progressed. There is a small fixed number of insertion locks,
 987          * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
 988          * boundary, it updates the value stored in the lock to the how far it has
 989          * inserted, to allow the previous buffer to be flushed.
 990          *
 991          * Holding onto an insertion lock also protects RedoRecPtr and
 992          * fullPageWrites from changing until the insertion is finished.
 993          *
 994          * Step 2 can usually be done completely in parallel. If the required WAL
 995          * page is not initialized yet, you have to grab WALBufMappingLock to
 996          * initialize it, but the WAL writer tries to do that ahead of insertions
 997          * to avoid that from happening in the critical path.
 998          *
 999          *----------
1000          */
1001         START_CRIT_SECTION();
1002         if (isLogSwitch)
1003                 WALInsertLockAcquireExclusive();
1004         else
1005                 WALInsertLockAcquire();
1006
1007         /*
1008          * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
1009          * If so, may have to go back and have the caller recompute everything.
1010          * This can only happen just after a checkpoint, so it's better to be slow
1011          * in this case and fast otherwise.
1012          *
1013          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1014          * affect the contents of the XLOG record, so we'll update our local copy
1015          * but not force a recomputation.  (If doPageWrites was just turned off,
1016          * we could recompute the record without full pages, but we choose not to
1017          * bother.)
1018          */
1019         if (RedoRecPtr != Insert->RedoRecPtr)
1020         {
1021                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1022                 RedoRecPtr = Insert->RedoRecPtr;
1023         }
1024         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
1025
1026         if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
1027         {
1028                 /*
1029                  * Oops, some buffer now needs to be backed up that the caller didn't
1030                  * back up.  Start over.
1031                  */
1032                 WALInsertLockRelease();
1033                 END_CRIT_SECTION();
1034                 return InvalidXLogRecPtr;
1035         }
1036
1037         /*
1038          * Reserve space for the record in the WAL. This also sets the xl_prev
1039          * pointer.
1040          */
1041         if (isLogSwitch)
1042                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1043         else
1044         {
1045                 ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
1046                                                                   &rechdr->xl_prev);
1047                 inserted = true;
1048         }
1049
1050         if (inserted)
1051         {
1052                 /*
1053                  * Now that xl_prev has been filled in, calculate CRC of the record
1054                  * header.
1055                  */
1056                 rdata_crc = rechdr->xl_crc;
1057                 COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
1058                 FIN_CRC32C(rdata_crc);
1059                 rechdr->xl_crc = rdata_crc;
1060
1061                 /*
1062                  * All the record data, including the header, is now ready to be
1063                  * inserted. Copy the record in the space reserved.
1064                  */
1065                 CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
1066                                                         StartPos, EndPos);
1067
1068                 /*
1069                  * Unless record is flagged as not important, update LSN of last
1070                  * important record in the current slot. When holding all locks, just
1071                  * update the first one.
1072                  */
1073                 if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
1074                 {
1075                         int lockno = holdingAllLocks ? 0 : MyLockNo;
1076
1077                         WALInsertLocks[lockno].l.lastImportantAt = StartPos;
1078                 }
1079         }
1080         else
1081         {
1082                 /*
1083                  * This was an xlog-switch record, but the current insert location was
1084                  * already exactly at the beginning of a segment, so there was no need
1085                  * to do anything.
1086                  */
1087         }
1088
1089         /*
1090          * Done! Let others know that we're finished.
1091          */
1092         WALInsertLockRelease();
1093
1094         MarkCurrentTransactionIdLoggedIfAny();
1095
1096         END_CRIT_SECTION();
1097
1098         /*
1099          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1100          */
1101         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1102         {
1103                 SpinLockAcquire(&XLogCtl->info_lck);
1104                 /* advance global request to include new block(s) */
1105                 if (XLogCtl->LogwrtRqst.Write < EndPos)
1106                         XLogCtl->LogwrtRqst.Write = EndPos;
1107                 /* update local result copy while I have the chance */
1108                 LogwrtResult = XLogCtl->LogwrtResult;
1109                 SpinLockRelease(&XLogCtl->info_lck);
1110         }
1111
1112         /*
1113          * If this was an XLOG_SWITCH record, flush the record and the empty
1114          * padding space that fills the rest of the segment, and perform
1115          * end-of-segment actions (eg, notifying archiver).
1116          */
1117         if (isLogSwitch)
1118         {
1119                 TRACE_POSTGRESQL_WAL_SWITCH();
1120                 XLogFlush(EndPos);
1121
1122                 /*
1123                  * Even though we reserved the rest of the segment for us, which is
1124                  * reflected in EndPos, we return a pointer to just the end of the
1125                  * xlog-switch record.
1126                  */
1127                 if (inserted)
1128                 {
1129                         EndPos = StartPos + SizeOfXLogRecord;
1130                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1131                         {
1132                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1133                                         EndPos += SizeOfXLogLongPHD;
1134                                 else
1135                                         EndPos += SizeOfXLogShortPHD;
1136                         }
1137                 }
1138         }
1139
1140 #ifdef WAL_DEBUG
1141         if (XLOG_DEBUG)
1142         {
1143                 static XLogReaderState *debug_reader = NULL;
1144                 StringInfoData buf;
1145                 StringInfoData recordBuf;
1146                 char       *errormsg = NULL;
1147                 MemoryContext oldCxt;
1148
1149                 oldCxt = MemoryContextSwitchTo(walDebugCxt);
1150
1151                 initStringInfo(&buf);
1152                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1153                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1154
1155                 /*
1156                  * We have to piece together the WAL record data from the XLogRecData
1157                  * entries, so that we can pass it to the rm_desc function as one
1158                  * contiguous chunk.
1159                  */
1160                 initStringInfo(&recordBuf);
1161                 for (; rdata != NULL; rdata = rdata->next)
1162                         appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1163
1164                 if (!debug_reader)
1165                         debug_reader = XLogReaderAllocate(NULL, NULL);
1166
1167                 if (!debug_reader)
1168                 {
1169                         appendStringInfoString(&buf, "error decoding record: out of memory");
1170                 }
1171                 else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1172                                                                    &errormsg))
1173                 {
1174                         appendStringInfo(&buf, "error decoding record: %s",
1175                                                          errormsg ? errormsg : "no error message");
1176                 }
1177                 else
1178                 {
1179                         appendStringInfoString(&buf, " - ");
1180                         xlog_outdesc(&buf, debug_reader);
1181                 }
1182                 elog(LOG, "%s", buf.data);
1183
1184                 pfree(buf.data);
1185                 pfree(recordBuf.data);
1186                 MemoryContextSwitchTo(oldCxt);
1187         }
1188 #endif
1189
1190         /*
1191          * Update our global variables
1192          */
1193         ProcLastRecPtr = StartPos;
1194         XactLastRecEnd = EndPos;
1195
1196         return EndPos;
1197 }
1198
1199 /*
1200  * Reserves the right amount of space for a record of given size from the WAL.
1201  * *StartPos is set to the beginning of the reserved section, *EndPos to
1202  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1203  * used to set the xl_prev of this record.
1204  *
1205  * This is the performance critical part of XLogInsert that must be serialized
1206  * across backends. The rest can happen mostly in parallel. Try to keep this
1207  * section as short as possible, insertpos_lck can be heavily contended on a
1208  * busy system.
1209  *
1210  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1211  * where we actually copy the record to the reserved space.
1212  */
1213 static void
1214 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1215                                                   XLogRecPtr *PrevPtr)
1216 {
1217         XLogCtlInsert *Insert = &XLogCtl->Insert;
1218         uint64          startbytepos;
1219         uint64          endbytepos;
1220         uint64          prevbytepos;
1221
1222         size = MAXALIGN(size);
1223
1224         /* All (non xlog-switch) records should contain data. */
1225         Assert(size > SizeOfXLogRecord);
1226
1227         /*
1228          * The duration the spinlock needs to be held is minimized by minimizing
1229          * the calculations that have to be done while holding the lock. The
1230          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1231          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1232          * page headers. The mapping between "usable" byte positions and physical
1233          * positions (XLogRecPtrs) can be done outside the locked region, and
1234          * because the usable byte position doesn't include any headers, reserving
1235          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1236          */
1237         SpinLockAcquire(&Insert->insertpos_lck);
1238
1239         startbytepos = Insert->CurrBytePos;
1240         endbytepos = startbytepos + size;
1241         prevbytepos = Insert->PrevBytePos;
1242         Insert->CurrBytePos = endbytepos;
1243         Insert->PrevBytePos = startbytepos;
1244
1245         SpinLockRelease(&Insert->insertpos_lck);
1246
1247         *StartPos = XLogBytePosToRecPtr(startbytepos);
1248         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1249         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1250
1251         /*
1252          * Check that the conversions between "usable byte positions" and
1253          * XLogRecPtrs work consistently in both directions.
1254          */
1255         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1256         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1257         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1258 }
1259
1260 /*
1261  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1262  *
1263  * A log-switch record is handled slightly differently. The rest of the
1264  * segment will be reserved for this insertion, as indicated by the returned
1265  * *EndPos value. However, if we are already at the beginning of the current
1266  * segment, *StartPos and *EndPos are set to the current location without
1267  * reserving any space, and the function returns false.
1268 */
1269 static bool
1270 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1271 {
1272         XLogCtlInsert *Insert = &XLogCtl->Insert;
1273         uint64          startbytepos;
1274         uint64          endbytepos;
1275         uint64          prevbytepos;
1276         uint32          size = MAXALIGN(SizeOfXLogRecord);
1277         XLogRecPtr      ptr;
1278         uint32          segleft;
1279
1280         /*
1281          * These calculations are a bit heavy-weight to be done while holding a
1282          * spinlock, but since we're holding all the WAL insertion locks, there
1283          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1284          * compete for it, but that's not called very frequently.
1285          */
1286         SpinLockAcquire(&Insert->insertpos_lck);
1287
1288         startbytepos = Insert->CurrBytePos;
1289
1290         ptr = XLogBytePosToEndRecPtr(startbytepos);
1291         if (ptr % XLOG_SEG_SIZE == 0)
1292         {
1293                 SpinLockRelease(&Insert->insertpos_lck);
1294                 *EndPos = *StartPos = ptr;
1295                 return false;
1296         }
1297
1298         endbytepos = startbytepos + size;
1299         prevbytepos = Insert->PrevBytePos;
1300
1301         *StartPos = XLogBytePosToRecPtr(startbytepos);
1302         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1303
1304         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1305         if (segleft != XLOG_SEG_SIZE)
1306         {
1307                 /* consume the rest of the segment */
1308                 *EndPos += segleft;
1309                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1310         }
1311         Insert->CurrBytePos = endbytepos;
1312         Insert->PrevBytePos = startbytepos;
1313
1314         SpinLockRelease(&Insert->insertpos_lck);
1315
1316         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1317
1318         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1319         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1320         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1321         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1322
1323         return true;
1324 }
1325
1326 /*
1327  * Checks whether the current buffer page and backup page stored in the
1328  * WAL record are consistent or not. Before comparing the two pages, a
1329  * masking can be applied to the pages to ignore certain areas like hint bits,
1330  * unused space between pd_lower and pd_upper among other things. This
1331  * function should be called once WAL replay has been completed for a
1332  * given record.
1333  */
1334 static void
1335 checkXLogConsistency(XLogReaderState *record)
1336 {
1337         RmgrId          rmid = XLogRecGetRmid(record);
1338         RelFileNode rnode;
1339         ForkNumber      forknum;
1340         BlockNumber blkno;
1341         int                     block_id;
1342
1343         /* Records with no backup blocks have no need for consistency checks. */
1344         if (!XLogRecHasAnyBlockRefs(record))
1345                 return;
1346
1347         Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1348
1349         for (block_id = 0; block_id <= record->max_block_id; block_id++)
1350         {
1351                 Buffer          buf;
1352                 Page            page;
1353
1354                 if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1355                 {
1356                         /*
1357                          * WAL record doesn't contain a block reference with the given id.
1358                          * Do nothing.
1359                          */
1360                         continue;
1361                 }
1362
1363                 Assert(XLogRecHasBlockImage(record, block_id));
1364
1365                 if (XLogRecBlockImageApply(record, block_id))
1366                 {
1367                         /*
1368                          * WAL record has already applied the page, so bypass the
1369                          * consistency check as that would result in comparing the full
1370                          * page stored in the record with itself.
1371                          */
1372                         continue;
1373                 }
1374
1375                 /*
1376                  * Read the contents from the current buffer and store it in a
1377                  * temporary page.
1378                  */
1379                 buf = XLogReadBufferExtended(rnode, forknum, blkno,
1380                                                                          RBM_NORMAL_NO_LOG);
1381                 if (!BufferIsValid(buf))
1382                         continue;
1383
1384                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1385                 page = BufferGetPage(buf);
1386
1387                 /*
1388                  * Take a copy of the local page where WAL has been applied to have a
1389                  * comparison base before masking it...
1390                  */
1391                 memcpy(replay_image_masked, page, BLCKSZ);
1392
1393                 /* No need for this page anymore now that a copy is in. */
1394                 UnlockReleaseBuffer(buf);
1395
1396                 /*
1397                  * If the block LSN is already ahead of this WAL record, we can't
1398                  * expect contents to match.  This can happen if recovery is restarted.
1399                  */
1400                 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1401                         continue;
1402
1403                 /*
1404                  * Read the contents from the backup copy, stored in WAL record and
1405                  * store it in a temporary page. There is no need to allocate a new
1406                  * page here, a local buffer is fine to hold its contents and a mask
1407                  * can be directly applied on it.
1408                  */
1409                 if (!RestoreBlockImage(record, block_id, master_image_masked))
1410                         elog(ERROR, "failed to restore block image");
1411
1412                 /*
1413                  * If masking function is defined, mask both the master and replay
1414                  * images
1415                  */
1416                 if (RmgrTable[rmid].rm_mask != NULL)
1417                 {
1418                         RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1419                         RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1420                 }
1421
1422                 /* Time to compare the master and replay images. */
1423                 if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
1424                 {
1425                         elog(FATAL,
1426                            "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1427                                  rnode.spcNode, rnode.dbNode, rnode.relNode,
1428                                  forknum, blkno);
1429                 }
1430         }
1431 }
1432
1433 /*
1434  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1435  * area in the WAL.
1436  */
1437 static void
1438 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1439                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1440 {
1441         char       *currpos;
1442         int                     freespace;
1443         int                     written;
1444         XLogRecPtr      CurrPos;
1445         XLogPageHeader pagehdr;
1446
1447         /*
1448          * Get a pointer to the right place in the right WAL buffer to start
1449          * inserting to.
1450          */
1451         CurrPos = StartPos;
1452         currpos = GetXLogBuffer(CurrPos);
1453         freespace = INSERT_FREESPACE(CurrPos);
1454
1455         /*
1456          * there should be enough space for at least the first field (xl_tot_len)
1457          * on this page.
1458          */
1459         Assert(freespace >= sizeof(uint32));
1460
1461         /* Copy record data */
1462         written = 0;
1463         while (rdata != NULL)
1464         {
1465                 char       *rdata_data = rdata->data;
1466                 int                     rdata_len = rdata->len;
1467
1468                 while (rdata_len > freespace)
1469                 {
1470                         /*
1471                          * Write what fits on this page, and continue on the next page.
1472                          */
1473                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1474                         memcpy(currpos, rdata_data, freespace);
1475                         rdata_data += freespace;
1476                         rdata_len -= freespace;
1477                         written += freespace;
1478                         CurrPos += freespace;
1479
1480                         /*
1481                          * Get pointer to beginning of next page, and set the xlp_rem_len
1482                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1483                          *
1484                          * It's safe to set the contrecord flag and xlp_rem_len without a
1485                          * lock on the page. All the other flags were already set when the
1486                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1487                          * only backend that needs to set the contrecord flag.
1488                          */
1489                         currpos = GetXLogBuffer(CurrPos);
1490                         pagehdr = (XLogPageHeader) currpos;
1491                         pagehdr->xlp_rem_len = write_len - written;
1492                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1493
1494                         /* skip over the page header */
1495                         if (CurrPos % XLogSegSize == 0)
1496                         {
1497                                 CurrPos += SizeOfXLogLongPHD;
1498                                 currpos += SizeOfXLogLongPHD;
1499                         }
1500                         else
1501                         {
1502                                 CurrPos += SizeOfXLogShortPHD;
1503                                 currpos += SizeOfXLogShortPHD;
1504                         }
1505                         freespace = INSERT_FREESPACE(CurrPos);
1506                 }
1507
1508                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1509                 memcpy(currpos, rdata_data, rdata_len);
1510                 currpos += rdata_len;
1511                 CurrPos += rdata_len;
1512                 freespace -= rdata_len;
1513                 written += rdata_len;
1514
1515                 rdata = rdata->next;
1516         }
1517         Assert(written == write_len);
1518
1519         /*
1520          * If this was an xlog-switch, it's not enough to write the switch record,
1521          * we also have to consume all the remaining space in the WAL segment. We
1522          * have already reserved it for us, but we still need to make sure it's
1523          * allocated and zeroed in the WAL buffers so that when the caller (or
1524          * someone else) does XLogWrite(), it can really write out all the zeros.
1525          */
1526         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1527         {
1528                 /* An xlog-switch record doesn't contain any data besides the header */
1529                 Assert(write_len == SizeOfXLogRecord);
1530
1531                 /*
1532                  * We do this one page at a time, to make sure we don't deadlock
1533                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1534                  */
1535                 Assert(EndPos % XLogSegSize == 0);
1536
1537                 /* Use up all the remaining space on the first page */
1538                 CurrPos += freespace;
1539
1540                 while (CurrPos < EndPos)
1541                 {
1542                         /* initialize the next page (if not initialized already) */
1543                         WALInsertLockUpdateInsertingAt(CurrPos);
1544                         AdvanceXLInsertBuffer(CurrPos, false);
1545                         CurrPos += XLOG_BLCKSZ;
1546                 }
1547         }
1548         else
1549         {
1550                 /* Align the end position, so that the next record starts aligned */
1551                 CurrPos = MAXALIGN64(CurrPos);
1552         }
1553
1554         if (CurrPos != EndPos)
1555                 elog(PANIC, "space reserved for WAL record does not match what was written");
1556 }
1557
1558 /*
1559  * Acquire a WAL insertion lock, for inserting to WAL.
1560  */
1561 static void
1562 WALInsertLockAcquire(void)
1563 {
1564         bool            immed;
1565
1566         /*
1567          * It doesn't matter which of the WAL insertion locks we acquire, so try
1568          * the one we used last time.  If the system isn't particularly busy, it's
1569          * a good bet that it's still available, and it's good to have some
1570          * affinity to a particular lock so that you don't unnecessarily bounce
1571          * cache lines between processes when there's no contention.
1572          *
1573          * If this is the first time through in this backend, pick a lock
1574          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1575          * lot of very short connections.
1576          */
1577         static int      lockToTry = -1;
1578
1579         if (lockToTry == -1)
1580                 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1581         MyLockNo = lockToTry;
1582
1583         /*
1584          * The insertingAt value is initially set to 0, as we don't know our
1585          * insert location yet.
1586          */
1587         immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1588         if (!immed)
1589         {
1590                 /*
1591                  * If we couldn't get the lock immediately, try another lock next
1592                  * time.  On a system with more insertion locks than concurrent
1593                  * inserters, this causes all the inserters to eventually migrate to a
1594                  * lock that no-one else is using.  On a system with more inserters
1595                  * than locks, it still helps to distribute the inserters evenly
1596                  * across the locks.
1597                  */
1598                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1599         }
1600 }
1601
1602 /*
1603  * Acquire all WAL insertion locks, to prevent other backends from inserting
1604  * to WAL.
1605  */
1606 static void
1607 WALInsertLockAcquireExclusive(void)
1608 {
1609         int                     i;
1610
1611         /*
1612          * When holding all the locks, all but the last lock's insertingAt
1613          * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1614          * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1615          */
1616         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1617         {
1618                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1619                 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1620                                                 &WALInsertLocks[i].l.insertingAt,
1621                                                 PG_UINT64_MAX);
1622         }
1623         /* Variable value reset to 0 at release */
1624         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1625
1626         holdingAllLocks = true;
1627 }
1628
1629 /*
1630  * Release our insertion lock (or locks, if we're holding them all).
1631  *
1632  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1633  * next time the lock is acquired.
1634  */
1635 static void
1636 WALInsertLockRelease(void)
1637 {
1638         if (holdingAllLocks)
1639         {
1640                 int                     i;
1641
1642                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1643                         LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1644                                                                   &WALInsertLocks[i].l.insertingAt,
1645                                                                   0);
1646
1647                 holdingAllLocks = false;
1648         }
1649         else
1650         {
1651                 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1652                                                           &WALInsertLocks[MyLockNo].l.insertingAt,
1653                                                           0);
1654         }
1655 }
1656
1657 /*
1658  * Update our insertingAt value, to let others know that we've finished
1659  * inserting up to that point.
1660  */
1661 static void
1662 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1663 {
1664         if (holdingAllLocks)
1665         {
1666                 /*
1667                  * We use the last lock to mark our actual position, see comments in
1668                  * WALInsertLockAcquireExclusive.
1669                  */
1670                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1671                                          &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1672                                                 insertingAt);
1673         }
1674         else
1675                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1676                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1677                                                 insertingAt);
1678 }
1679
1680 /*
1681  * Wait for any WAL insertions < upto to finish.
1682  *
1683  * Returns the location of the oldest insertion that is still in-progress.
1684  * Any WAL prior to that point has been fully copied into WAL buffers, and
1685  * can be flushed out to disk. Because this waits for any insertions older
1686  * than 'upto' to finish, the return value is always >= 'upto'.
1687  *
1688  * Note: When you are about to write out WAL, you must call this function
1689  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1690  * need to wait for an insertion to finish (or at least advance to next
1691  * uninitialized page), and the inserter might need to evict an old WAL buffer
1692  * to make room for a new one, which in turn requires WALWriteLock.
1693  */
1694 static XLogRecPtr
1695 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1696 {
1697         uint64          bytepos;
1698         XLogRecPtr      reservedUpto;
1699         XLogRecPtr      finishedUpto;
1700         XLogCtlInsert *Insert = &XLogCtl->Insert;
1701         int                     i;
1702
1703         if (MyProc == NULL)
1704                 elog(PANIC, "cannot wait without a PGPROC structure");
1705
1706         /* Read the current insert position */
1707         SpinLockAcquire(&Insert->insertpos_lck);
1708         bytepos = Insert->CurrBytePos;
1709         SpinLockRelease(&Insert->insertpos_lck);
1710         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1711
1712         /*
1713          * No-one should request to flush a piece of WAL that hasn't even been
1714          * reserved yet. However, it can happen if there is a block with a bogus
1715          * LSN on disk, for example. XLogFlush checks for that situation and
1716          * complains, but only after the flush. Here we just assume that to mean
1717          * that all WAL that has been reserved needs to be finished. In this
1718          * corner-case, the return value can be smaller than 'upto' argument.
1719          */
1720         if (upto > reservedUpto)
1721         {
1722                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1723                          (uint32) (upto >> 32), (uint32) upto,
1724                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1725                 upto = reservedUpto;
1726         }
1727
1728         /*
1729          * Loop through all the locks, sleeping on any in-progress insert older
1730          * than 'upto'.
1731          *
1732          * finishedUpto is our return value, indicating the point upto which all
1733          * the WAL insertions have been finished. Initialize it to the head of
1734          * reserved WAL, and as we iterate through the insertion locks, back it
1735          * out for any insertion that's still in progress.
1736          */
1737         finishedUpto = reservedUpto;
1738         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1739         {
1740                 XLogRecPtr      insertingat = InvalidXLogRecPtr;
1741
1742                 do
1743                 {
1744                         /*
1745                          * See if this insertion is in progress. LWLockWait will wait for
1746                          * the lock to be released, or for the 'value' to be set by a
1747                          * LWLockUpdateVar call.  When a lock is initially acquired, its
1748                          * value is 0 (InvalidXLogRecPtr), which means that we don't know
1749                          * where it's inserting yet.  We will have to wait for it.  If
1750                          * it's a small insertion, the record will most likely fit on the
1751                          * same page and the inserter will release the lock without ever
1752                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1753                          * advertise the insertion point with LWLockUpdateVar before
1754                          * sleeping.
1755                          */
1756                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1757                                                                  &WALInsertLocks[i].l.insertingAt,
1758                                                                  insertingat, &insertingat))
1759                         {
1760                                 /* the lock was free, so no insertion in progress */
1761                                 insertingat = InvalidXLogRecPtr;
1762                                 break;
1763                         }
1764
1765                         /*
1766                          * This insertion is still in progress. Have to wait, unless the
1767                          * inserter has proceeded past 'upto'.
1768                          */
1769                 } while (insertingat < upto);
1770
1771                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1772                         finishedUpto = insertingat;
1773         }
1774         return finishedUpto;
1775 }
1776
1777 /*
1778  * Get a pointer to the right location in the WAL buffer containing the
1779  * given XLogRecPtr.
1780  *
1781  * If the page is not initialized yet, it is initialized. That might require
1782  * evicting an old dirty buffer from the buffer cache, which means I/O.
1783  *
1784  * The caller must ensure that the page containing the requested location
1785  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1786  * hold onto a WAL insertion lock with the insertingAt position set to
1787  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1788  * to evict an old page from the buffer. (This means that once you call
1789  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1790  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1791  * later, because older buffers might be recycled already)
1792  */
1793 static char *
1794 GetXLogBuffer(XLogRecPtr ptr)
1795 {
1796         int                     idx;
1797         XLogRecPtr      endptr;
1798         static uint64 cachedPage = 0;
1799         static char *cachedPos = NULL;
1800         XLogRecPtr      expectedEndPtr;
1801
1802         /*
1803          * Fast path for the common case that we need to access again the same
1804          * page as last time.
1805          */
1806         if (ptr / XLOG_BLCKSZ == cachedPage)
1807         {
1808                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1809                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1810                 return cachedPos + ptr % XLOG_BLCKSZ;
1811         }
1812
1813         /*
1814          * The XLog buffer cache is organized so that a page is always loaded to a
1815          * particular buffer.  That way we can easily calculate the buffer a given
1816          * page must be loaded into, from the XLogRecPtr alone.
1817          */
1818         idx = XLogRecPtrToBufIdx(ptr);
1819
1820         /*
1821          * See what page is loaded in the buffer at the moment. It could be the
1822          * page we're looking for, or something older. It can't be anything newer
1823          * - that would imply the page we're looking for has already been written
1824          * out to disk and evicted, and the caller is responsible for making sure
1825          * that doesn't happen.
1826          *
1827          * However, we don't hold a lock while we read the value. If someone has
1828          * just initialized the page, it's possible that we get a "torn read" of
1829          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1830          * that case we will see a bogus value. That's ok, we'll grab the mapping
1831          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1832          * the page we're looking for. But it means that when we do this unlocked
1833          * read, we might see a value that appears to be ahead of the page we're
1834          * looking for. Don't PANIC on that, until we've verified the value while
1835          * holding the lock.
1836          */
1837         expectedEndPtr = ptr;
1838         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1839
1840         endptr = XLogCtl->xlblocks[idx];
1841         if (expectedEndPtr != endptr)
1842         {
1843                 XLogRecPtr      initializedUpto;
1844
1845                 /*
1846                  * Before calling AdvanceXLInsertBuffer(), which can block, let others
1847                  * know how far we're finished with inserting the record.
1848                  *
1849                  * NB: If 'ptr' points to just after the page header, advertise a
1850                  * position at the beginning of the page rather than 'ptr' itself. If
1851                  * there are no other insertions running, someone might try to flush
1852                  * up to our advertised location. If we advertised a position after
1853                  * the page header, someone might try to flush the page header, even
1854                  * though page might actually not be initialized yet. As the first
1855                  * inserter on the page, we are effectively responsible for making
1856                  * sure that it's initialized, before we let insertingAt to move past
1857                  * the page header.
1858                  */
1859                 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1860                         ptr % XLOG_SEG_SIZE > XLOG_BLCKSZ)
1861                         initializedUpto = ptr - SizeOfXLogShortPHD;
1862                 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1863                                  ptr % XLOG_SEG_SIZE < XLOG_BLCKSZ)
1864                         initializedUpto = ptr - SizeOfXLogLongPHD;
1865                 else
1866                         initializedUpto = ptr;
1867
1868                 WALInsertLockUpdateInsertingAt(initializedUpto);
1869
1870                 AdvanceXLInsertBuffer(ptr, false);
1871                 endptr = XLogCtl->xlblocks[idx];
1872
1873                 if (expectedEndPtr != endptr)
1874                         elog(PANIC, "could not find WAL buffer for %X/%X",
1875                                  (uint32) (ptr >> 32), (uint32) ptr);
1876         }
1877         else
1878         {
1879                 /*
1880                  * Make sure the initialization of the page is visible to us, and
1881                  * won't arrive later to overwrite the WAL data we write on the page.
1882                  */
1883                 pg_memory_barrier();
1884         }
1885
1886         /*
1887          * Found the buffer holding this page. Return a pointer to the right
1888          * offset within the page.
1889          */
1890         cachedPage = ptr / XLOG_BLCKSZ;
1891         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1892
1893         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1894         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1895
1896         return cachedPos + ptr % XLOG_BLCKSZ;
1897 }
1898
1899 /*
1900  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1901  * is the position starting from the beginning of WAL, excluding all WAL
1902  * page headers.
1903  */
1904 static XLogRecPtr
1905 XLogBytePosToRecPtr(uint64 bytepos)
1906 {
1907         uint64          fullsegs;
1908         uint64          fullpages;
1909         uint64          bytesleft;
1910         uint32          seg_offset;
1911         XLogRecPtr      result;
1912
1913         fullsegs = bytepos / UsableBytesInSegment;
1914         bytesleft = bytepos % UsableBytesInSegment;
1915
1916         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1917         {
1918                 /* fits on first page of segment */
1919                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1920         }
1921         else
1922         {
1923                 /* account for the first page on segment with long header */
1924                 seg_offset = XLOG_BLCKSZ;
1925                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1926
1927                 fullpages = bytesleft / UsableBytesInPage;
1928                 bytesleft = bytesleft % UsableBytesInPage;
1929
1930                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1931         }
1932
1933         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1934
1935         return result;
1936 }
1937
1938 /*
1939  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1940  * returns a pointer to the beginning of the page (ie. before page header),
1941  * not to where the first xlog record on that page would go to. This is used
1942  * when converting a pointer to the end of a record.
1943  */
1944 static XLogRecPtr
1945 XLogBytePosToEndRecPtr(uint64 bytepos)
1946 {
1947         uint64          fullsegs;
1948         uint64          fullpages;
1949         uint64          bytesleft;
1950         uint32          seg_offset;
1951         XLogRecPtr      result;
1952
1953         fullsegs = bytepos / UsableBytesInSegment;
1954         bytesleft = bytepos % UsableBytesInSegment;
1955
1956         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1957         {
1958                 /* fits on first page of segment */
1959                 if (bytesleft == 0)
1960                         seg_offset = 0;
1961                 else
1962                         seg_offset = bytesleft + SizeOfXLogLongPHD;
1963         }
1964         else
1965         {
1966                 /* account for the first page on segment with long header */
1967                 seg_offset = XLOG_BLCKSZ;
1968                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1969
1970                 fullpages = bytesleft / UsableBytesInPage;
1971                 bytesleft = bytesleft % UsableBytesInPage;
1972
1973                 if (bytesleft == 0)
1974                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1975                 else
1976                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1977         }
1978
1979         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1980
1981         return result;
1982 }
1983
1984 /*
1985  * Convert an XLogRecPtr to a "usable byte position".
1986  */
1987 static uint64
1988 XLogRecPtrToBytePos(XLogRecPtr ptr)
1989 {
1990         uint64          fullsegs;
1991         uint32          fullpages;
1992         uint32          offset;
1993         uint64          result;
1994
1995         XLByteToSeg(ptr, fullsegs);
1996
1997         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
1998         offset = ptr % XLOG_BLCKSZ;
1999
2000         if (fullpages == 0)
2001         {
2002                 result = fullsegs * UsableBytesInSegment;
2003                 if (offset > 0)
2004                 {
2005                         Assert(offset >= SizeOfXLogLongPHD);
2006                         result += offset - SizeOfXLogLongPHD;
2007                 }
2008         }
2009         else
2010         {
2011                 result = fullsegs * UsableBytesInSegment +
2012                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2013                         (fullpages - 1) * UsableBytesInPage;            /* full pages */
2014                 if (offset > 0)
2015                 {
2016                         Assert(offset >= SizeOfXLogShortPHD);
2017                         result += offset - SizeOfXLogShortPHD;
2018                 }
2019         }
2020
2021         return result;
2022 }
2023
2024 /*
2025  * Initialize XLOG buffers, writing out old buffers if they still contain
2026  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2027  * true, initialize as many pages as we can without having to write out
2028  * unwritten data. Any new pages are initialized to zeros, with pages headers
2029  * initialized properly.
2030  */
2031 static void
2032 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2033 {
2034         XLogCtlInsert *Insert = &XLogCtl->Insert;
2035         int                     nextidx;
2036         XLogRecPtr      OldPageRqstPtr;
2037         XLogwrtRqst WriteRqst;
2038         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2039         XLogRecPtr      NewPageBeginPtr;
2040         XLogPageHeader NewPage;
2041         int                     npages = 0;
2042
2043         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2044
2045         /*
2046          * Now that we have the lock, check if someone initialized the page
2047          * already.
2048          */
2049         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2050         {
2051                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2052
2053                 /*
2054                  * Get ending-offset of the buffer page we need to replace (this may
2055                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2056                  * already written out.
2057                  */
2058                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2059                 if (LogwrtResult.Write < OldPageRqstPtr)
2060                 {
2061                         /*
2062                          * Nope, got work to do. If we just want to pre-initialize as much
2063                          * as we can without flushing, give up now.
2064                          */
2065                         if (opportunistic)
2066                                 break;
2067
2068                         /* Before waiting, get info_lck and update LogwrtResult */
2069                         SpinLockAcquire(&XLogCtl->info_lck);
2070                         if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2071                                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2072                         LogwrtResult = XLogCtl->LogwrtResult;
2073                         SpinLockRelease(&XLogCtl->info_lck);
2074
2075                         /*
2076                          * Now that we have an up-to-date LogwrtResult value, see if we
2077                          * still need to write it or if someone else already did.
2078                          */
2079                         if (LogwrtResult.Write < OldPageRqstPtr)
2080                         {
2081                                 /*
2082                                  * Must acquire write lock. Release WALBufMappingLock first,
2083                                  * to make sure that all insertions that we need to wait for
2084                                  * can finish (up to this same position). Otherwise we risk
2085                                  * deadlock.
2086                                  */
2087                                 LWLockRelease(WALBufMappingLock);
2088
2089                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2090
2091                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2092
2093                                 LogwrtResult = XLogCtl->LogwrtResult;
2094                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2095                                 {
2096                                         /* OK, someone wrote it already */
2097                                         LWLockRelease(WALWriteLock);
2098                                 }
2099                                 else
2100                                 {
2101                                         /* Have to write it ourselves */
2102                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2103                                         WriteRqst.Write = OldPageRqstPtr;
2104                                         WriteRqst.Flush = 0;
2105                                         XLogWrite(WriteRqst, false);
2106                                         LWLockRelease(WALWriteLock);
2107                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2108                                 }
2109                                 /* Re-acquire WALBufMappingLock and retry */
2110                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2111                                 continue;
2112                         }
2113                 }
2114
2115                 /*
2116                  * Now the next buffer slot is free and we can set it up to be the
2117                  * next output page.
2118                  */
2119                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2120                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2121
2122                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2123
2124                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2125
2126                 /*
2127                  * Be sure to re-zero the buffer so that bytes beyond what we've
2128                  * written will look like zeroes and not valid XLOG records...
2129                  */
2130                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2131
2132                 /*
2133                  * Fill the new page's header
2134                  */
2135                 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2136
2137                 /* NewPage->xlp_info = 0; */    /* done by memset */
2138                 NewPage->xlp_tli = ThisTimeLineID;
2139                 NewPage->xlp_pageaddr = NewPageBeginPtr;
2140
2141                 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2142
2143                 /*
2144                  * If online backup is not in progress, mark the header to indicate
2145                  * that* WAL records beginning in this page have removable backup
2146                  * blocks.  This allows the WAL archiver to know whether it is safe to
2147                  * compress archived WAL data by transforming full-block records into
2148                  * the non-full-block format.  It is sufficient to record this at the
2149                  * page level because we force a page switch (in fact a segment
2150                  * switch) when starting a backup, so the flag will be off before any
2151                  * records can be written during the backup.  At the end of a backup,
2152                  * the last page will be marked as all unsafe when perhaps only part
2153                  * is unsafe, but at worst the archiver would miss the opportunity to
2154                  * compress a few records.
2155                  */
2156                 if (!Insert->forcePageWrites)
2157                         NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2158
2159                 /*
2160                  * If first page of an XLOG segment file, make it a long header.
2161                  */
2162                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2163                 {
2164                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2165
2166                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2167                         NewLongPage->xlp_seg_size = XLogSegSize;
2168                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2169                         NewPage->xlp_info |= XLP_LONG_HEADER;
2170                 }
2171
2172                 /*
2173                  * Make sure the initialization of the page becomes visible to others
2174                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2175                  * holding a lock.
2176                  */
2177                 pg_write_barrier();
2178
2179                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2180
2181                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2182
2183                 npages++;
2184         }
2185         LWLockRelease(WALBufMappingLock);
2186
2187 #ifdef WAL_DEBUG
2188         if (XLOG_DEBUG && npages > 0)
2189         {
2190                 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2191                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2192         }
2193 #endif
2194 }
2195
2196 /*
2197  * Calculate CheckPointSegments based on max_wal_size and
2198  * checkpoint_completion_target.
2199  */
2200 static void
2201 CalculateCheckpointSegments(void)
2202 {
2203         double          target;
2204
2205         /*-------
2206          * Calculate the distance at which to trigger a checkpoint, to avoid
2207          * exceeding max_wal_size. This is based on two assumptions:
2208          *
2209          * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
2210          * b) during checkpoint, we consume checkpoint_completion_target *
2211          *        number of segments consumed between checkpoints.
2212          *-------
2213          */
2214         target = (double) max_wal_size / (2.0 + CheckPointCompletionTarget);
2215
2216         /* round down */
2217         CheckPointSegments = (int) target;
2218
2219         if (CheckPointSegments < 1)
2220                 CheckPointSegments = 1;
2221 }
2222
2223 void
2224 assign_max_wal_size(int newval, void *extra)
2225 {
2226         max_wal_size = newval;
2227         CalculateCheckpointSegments();
2228 }
2229
2230 void
2231 assign_checkpoint_completion_target(double newval, void *extra)
2232 {
2233         CheckPointCompletionTarget = newval;
2234         CalculateCheckpointSegments();
2235 }
2236
2237 /*
2238  * At a checkpoint, how many WAL segments to recycle as preallocated future
2239  * XLOG segments? Returns the highest segment that should be preallocated.
2240  */
2241 static XLogSegNo
2242 XLOGfileslop(XLogRecPtr PriorRedoPtr)
2243 {
2244         XLogSegNo       minSegNo;
2245         XLogSegNo       maxSegNo;
2246         double          distance;
2247         XLogSegNo       recycleSegNo;
2248
2249         /*
2250          * Calculate the segment numbers that min_wal_size and max_wal_size
2251          * correspond to. Always recycle enough segments to meet the minimum, and
2252          * remove enough segments to stay below the maximum.
2253          */
2254         minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
2255         maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
2256
2257         /*
2258          * Between those limits, recycle enough segments to get us through to the
2259          * estimated end of next checkpoint.
2260          *
2261          * To estimate where the next checkpoint will finish, assume that the
2262          * system runs steadily consuming CheckPointDistanceEstimate bytes between
2263          * every checkpoint.
2264          *
2265          * The reason this calculation is done from the prior checkpoint, not the
2266          * one that just finished, is that this behaves better if some checkpoint
2267          * cycles are abnormally short, like if you perform a manual checkpoint
2268          * right after a timed one. The manual checkpoint will make almost a full
2269          * cycle's worth of WAL segments available for recycling, because the
2270          * segments from the prior's prior, fully-sized checkpoint cycle are no
2271          * longer needed. However, the next checkpoint will make only few segments
2272          * available for recycling, the ones generated between the timed
2273          * checkpoint and the manual one right after that. If at the manual
2274          * checkpoint we only retained enough segments to get us to the next timed
2275          * one, and removed the rest, then at the next checkpoint we would not
2276          * have enough segments around for recycling, to get us to the checkpoint
2277          * after that. Basing the calculations on the distance from the prior redo
2278          * pointer largely fixes that problem.
2279          */
2280         distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2281         /* add 10% for good measure. */
2282         distance *= 1.10;
2283
2284         recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
2285
2286         if (recycleSegNo < minSegNo)
2287                 recycleSegNo = minSegNo;
2288         if (recycleSegNo > maxSegNo)
2289                 recycleSegNo = maxSegNo;
2290
2291         return recycleSegNo;
2292 }
2293
2294 /*
2295  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2296  *
2297  * new_segno indicates a log file that has just been filled up (or read
2298  * during recovery). We measure the distance from RedoRecPtr to new_segno
2299  * and see if that exceeds CheckPointSegments.
2300  *
2301  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2302  */
2303 static bool
2304 XLogCheckpointNeeded(XLogSegNo new_segno)
2305 {
2306         XLogSegNo       old_segno;
2307
2308         XLByteToSeg(RedoRecPtr, old_segno);
2309
2310         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2311                 return true;
2312         return false;
2313 }
2314
2315 /*
2316  * Write and/or fsync the log at least as far as WriteRqst indicates.
2317  *
2318  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2319  * may stop at any convenient boundary (such as a cache or logfile boundary).
2320  * This option allows us to avoid uselessly issuing multiple writes when a
2321  * single one would do.
2322  *
2323  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2324  * must be called before grabbing the lock, to make sure the data is ready to
2325  * write.
2326  */
2327 static void
2328 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2329 {
2330         bool            ispartialpage;
2331         bool            last_iteration;
2332         bool            finishing_seg;
2333         bool            use_existent;
2334         int                     curridx;
2335         int                     npages;
2336         int                     startidx;
2337         uint32          startoffset;
2338
2339         /* We should always be inside a critical section here */
2340         Assert(CritSectionCount > 0);
2341
2342         /*
2343          * Update local LogwrtResult (caller probably did this already, but...)
2344          */
2345         LogwrtResult = XLogCtl->LogwrtResult;
2346
2347         /*
2348          * Since successive pages in the xlog cache are consecutively allocated,
2349          * we can usually gather multiple pages together and issue just one
2350          * write() call.  npages is the number of pages we have determined can be
2351          * written together; startidx is the cache block index of the first one,
2352          * and startoffset is the file offset at which it should go. The latter
2353          * two variables are only valid when npages > 0, but we must initialize
2354          * all of them to keep the compiler quiet.
2355          */
2356         npages = 0;
2357         startidx = 0;
2358         startoffset = 0;
2359
2360         /*
2361          * Within the loop, curridx is the cache block index of the page to
2362          * consider writing.  Begin at the buffer containing the next unwritten
2363          * page, or last partially written page.
2364          */
2365         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2366
2367         while (LogwrtResult.Write < WriteRqst.Write)
2368         {
2369                 /*
2370                  * Make sure we're not ahead of the insert process.  This could happen
2371                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2372                  * last page that's been initialized by AdvanceXLInsertBuffer.
2373                  */
2374                 XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
2375
2376                 if (LogwrtResult.Write >= EndPtr)
2377                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2378                                  (uint32) (LogwrtResult.Write >> 32),
2379                                  (uint32) LogwrtResult.Write,
2380                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2381
2382                 /* Advance LogwrtResult.Write to end of current buffer page */
2383                 LogwrtResult.Write = EndPtr;
2384                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2385
2386                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2387                 {
2388                         /*
2389                          * Switch to new logfile segment.  We cannot have any pending
2390                          * pages here (since we dump what we have at segment end).
2391                          */
2392                         Assert(npages == 0);
2393                         if (openLogFile >= 0)
2394                                 XLogFileClose();
2395                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2396
2397                         /* create/use new log file */
2398                         use_existent = true;
2399                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2400                         openLogOff = 0;
2401                 }
2402
2403                 /* Make sure we have the current logfile open */
2404                 if (openLogFile < 0)
2405                 {
2406                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2407                         openLogFile = XLogFileOpen(openLogSegNo);
2408                         openLogOff = 0;
2409                 }
2410
2411                 /* Add current page to the set of pending pages-to-dump */
2412                 if (npages == 0)
2413                 {
2414                         /* first of group */
2415                         startidx = curridx;
2416                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2417                 }
2418                 npages++;
2419
2420                 /*
2421                  * Dump the set if this will be the last loop iteration, or if we are
2422                  * at the last page of the cache area (since the next page won't be
2423                  * contiguous in memory), or if we are at the end of the logfile
2424                  * segment.
2425                  */
2426                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2427
2428                 finishing_seg = !ispartialpage &&
2429                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2430
2431                 if (last_iteration ||
2432                         curridx == XLogCtl->XLogCacheBlck ||
2433                         finishing_seg)
2434                 {
2435                         char       *from;
2436                         Size            nbytes;
2437                         Size            nleft;
2438                         int                     written;
2439
2440                         /* Need to seek in the file? */
2441                         if (openLogOff != startoffset)
2442                         {
2443                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2444                                         ereport(PANIC,
2445                                                         (errcode_for_file_access(),
2446                                          errmsg("could not seek in log file %s to offset %u: %m",
2447                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2448                                                         startoffset)));
2449                                 openLogOff = startoffset;
2450                         }
2451
2452                         /* OK to write the page(s) */
2453                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2454                         nbytes = npages * (Size) XLOG_BLCKSZ;
2455                         nleft = nbytes;
2456                         do
2457                         {
2458                                 errno = 0;
2459                                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2460                                 written = write(openLogFile, from, nleft);
2461                                 pgstat_report_wait_end();
2462                                 if (written <= 0)
2463                                 {
2464                                         if (errno == EINTR)
2465                                                 continue;
2466                                         ereport(PANIC,
2467                                                         (errcode_for_file_access(),
2468                                                          errmsg("could not write to log file %s "
2469                                                                         "at offset %u, length %zu: %m",
2470                                                                  XLogFileNameP(ThisTimeLineID, openLogSegNo),
2471                                                                         openLogOff, nbytes)));
2472                                 }
2473                                 nleft -= written;
2474                                 from += written;
2475                         } while (nleft > 0);
2476
2477                         /* Update state for write */
2478                         openLogOff += nbytes;
2479                         npages = 0;
2480
2481                         /*
2482                          * If we just wrote the whole last page of a logfile segment,
2483                          * fsync the segment immediately.  This avoids having to go back
2484                          * and re-open prior segments when an fsync request comes along
2485                          * later. Doing it here ensures that one and only one backend will
2486                          * perform this fsync.
2487                          *
2488                          * This is also the right place to notify the Archiver that the
2489                          * segment is ready to copy to archival storage, and to update the
2490                          * timer for archive_timeout, and to signal for a checkpoint if
2491                          * too many logfile segments have been used since the last
2492                          * checkpoint.
2493                          */
2494                         if (finishing_seg)
2495                         {
2496                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2497
2498                                 /* signal that we need to wakeup walsenders later */
2499                                 WalSndWakeupRequest();
2500
2501                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2502
2503                                 if (XLogArchivingActive())
2504                                         XLogArchiveNotifySeg(openLogSegNo);
2505
2506                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2507                                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2508
2509                                 /*
2510                                  * Request a checkpoint if we've consumed too much xlog since
2511                                  * the last one.  For speed, we first check using the local
2512                                  * copy of RedoRecPtr, which might be out of date; if it looks
2513                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2514                                  * recheck.
2515                                  */
2516                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2517                                 {
2518                                         (void) GetRedoRecPtr();
2519                                         if (XLogCheckpointNeeded(openLogSegNo))
2520                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2521                                 }
2522                         }
2523                 }
2524
2525                 if (ispartialpage)
2526                 {
2527                         /* Only asked to write a partial page */
2528                         LogwrtResult.Write = WriteRqst.Write;
2529                         break;
2530                 }
2531                 curridx = NextBufIdx(curridx);
2532
2533                 /* If flexible, break out of loop as soon as we wrote something */
2534                 if (flexible && npages == 0)
2535                         break;
2536         }
2537
2538         Assert(npages == 0);
2539
2540         /*
2541          * If asked to flush, do so
2542          */
2543         if (LogwrtResult.Flush < WriteRqst.Flush &&
2544                 LogwrtResult.Flush < LogwrtResult.Write)
2545
2546         {
2547                 /*
2548                  * Could get here without iterating above loop, in which case we might
2549                  * have no open file or the wrong one.  However, we do not need to
2550                  * fsync more than one file.
2551                  */
2552                 if (sync_method != SYNC_METHOD_OPEN &&
2553                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2554                 {
2555                         if (openLogFile >= 0 &&
2556                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2557                                 XLogFileClose();
2558                         if (openLogFile < 0)
2559                         {
2560                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2561                                 openLogFile = XLogFileOpen(openLogSegNo);
2562                                 openLogOff = 0;
2563                         }
2564
2565                         issue_xlog_fsync(openLogFile, openLogSegNo);
2566                 }
2567
2568                 /* signal that we need to wakeup walsenders later */
2569                 WalSndWakeupRequest();
2570
2571                 LogwrtResult.Flush = LogwrtResult.Write;
2572         }
2573
2574         /*
2575          * Update shared-memory status
2576          *
2577          * We make sure that the shared 'request' values do not fall behind the
2578          * 'result' values.  This is not absolutely essential, but it saves some
2579          * code in a couple of places.
2580          */
2581         {
2582                 SpinLockAcquire(&XLogCtl->info_lck);
2583                 XLogCtl->LogwrtResult = LogwrtResult;
2584                 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2585                         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2586                 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2587                         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2588                 SpinLockRelease(&XLogCtl->info_lck);
2589         }
2590 }
2591
2592 /*
2593  * Record the LSN for an asynchronous transaction commit/abort
2594  * and nudge the WALWriter if there is work for it to do.
2595  * (This should not be called for synchronous commits.)
2596  */
2597 void
2598 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2599 {
2600         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2601         bool            sleeping;
2602
2603         SpinLockAcquire(&XLogCtl->info_lck);
2604         LogwrtResult = XLogCtl->LogwrtResult;
2605         sleeping = XLogCtl->WalWriterSleeping;
2606         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2607                 XLogCtl->asyncXactLSN = asyncXactLSN;
2608         SpinLockRelease(&XLogCtl->info_lck);
2609
2610         /*
2611          * If the WALWriter is sleeping, we should kick it to make it come out of
2612          * low-power mode.  Otherwise, determine whether there's a full page of
2613          * WAL available to write.
2614          */
2615         if (!sleeping)
2616         {
2617                 /* back off to last completed page boundary */
2618                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2619
2620                 /* if we have already flushed that far, we're done */
2621                 if (WriteRqstPtr <= LogwrtResult.Flush)
2622                         return;
2623         }
2624
2625         /*
2626          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2627          * to come out of low-power mode so that this async commit will reach disk
2628          * within the expected amount of time.
2629          */
2630         if (ProcGlobal->walwriterLatch)
2631                 SetLatch(ProcGlobal->walwriterLatch);
2632 }
2633
2634 /*
2635  * Record the LSN up to which we can remove WAL because it's not required by
2636  * any replication slot.
2637  */
2638 void
2639 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2640 {
2641         SpinLockAcquire(&XLogCtl->info_lck);
2642         XLogCtl->replicationSlotMinLSN = lsn;
2643         SpinLockRelease(&XLogCtl->info_lck);
2644 }
2645
2646
2647 /*
2648  * Return the oldest LSN we must retain to satisfy the needs of some
2649  * replication slot.
2650  */
2651 static XLogRecPtr
2652 XLogGetReplicationSlotMinimumLSN(void)
2653 {
2654         XLogRecPtr      retval;
2655
2656         SpinLockAcquire(&XLogCtl->info_lck);
2657         retval = XLogCtl->replicationSlotMinLSN;
2658         SpinLockRelease(&XLogCtl->info_lck);
2659
2660         return retval;
2661 }
2662
2663 /*
2664  * Advance minRecoveryPoint in control file.
2665  *
2666  * If we crash during recovery, we must reach this point again before the
2667  * database is consistent.
2668  *
2669  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2670  * is only updated if it's not already greater than or equal to 'lsn'.
2671  */
2672 static void
2673 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2674 {
2675         /* Quick check using our local copy of the variable */
2676         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2677                 return;
2678
2679         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2680
2681         /* update local copy */
2682         minRecoveryPoint = ControlFile->minRecoveryPoint;
2683         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2684
2685         /*
2686          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2687          * i.e., we're doing crash recovery.  We never modify the control file's
2688          * value in that case, so we can short-circuit future checks here too.
2689          */
2690         if (minRecoveryPoint == 0)
2691                 updateMinRecoveryPoint = false;
2692         else if (force || minRecoveryPoint < lsn)
2693         {
2694                 XLogRecPtr      newMinRecoveryPoint;
2695                 TimeLineID      newMinRecoveryPointTLI;
2696
2697                 /*
2698                  * To avoid having to update the control file too often, we update it
2699                  * all the way to the last record being replayed, even though 'lsn'
2700                  * would suffice for correctness.  This also allows the 'force' case
2701                  * to not need a valid 'lsn' value.
2702                  *
2703                  * Another important reason for doing it this way is that the passed
2704                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2705                  * the caller got it from a corrupted heap page.  Accepting such a
2706                  * value as the min recovery point would prevent us from coming up at
2707                  * all.  Instead, we just log a warning and continue with recovery.
2708                  * (See also the comments about corrupt LSNs in XLogFlush.)
2709                  */
2710                 SpinLockAcquire(&XLogCtl->info_lck);
2711                 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2712                 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2713                 SpinLockRelease(&XLogCtl->info_lck);
2714
2715                 if (!force && newMinRecoveryPoint < lsn)
2716                         elog(WARNING,
2717                            "xlog min recovery request %X/%X is past current point %X/%X",
2718                                  (uint32) (lsn >> 32), (uint32) lsn,
2719                                  (uint32) (newMinRecoveryPoint >> 32),
2720                                  (uint32) newMinRecoveryPoint);
2721
2722                 /* update control file */
2723                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2724                 {
2725                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2726                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2727                         UpdateControlFile();
2728                         minRecoveryPoint = newMinRecoveryPoint;
2729                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2730
2731                         ereport(DEBUG2,
2732                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2733                                                 (uint32) (minRecoveryPoint >> 32),
2734                                                 (uint32) minRecoveryPoint,
2735                                                 newMinRecoveryPointTLI)));
2736                 }
2737         }
2738         LWLockRelease(ControlFileLock);
2739 }
2740
2741 /*
2742  * Ensure that all XLOG data through the given position is flushed to disk.
2743  *
2744  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2745  * already held, and we try to avoid acquiring it if possible.
2746  */
2747 void
2748 XLogFlush(XLogRecPtr record)
2749 {
2750         XLogRecPtr      WriteRqstPtr;
2751         XLogwrtRqst WriteRqst;
2752
2753         /*
2754          * During REDO, we are reading not writing WAL.  Therefore, instead of
2755          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2756          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2757          * to act this way too, and because when it tries to write the
2758          * end-of-recovery checkpoint, it should indeed flush.
2759          */
2760         if (!XLogInsertAllowed())
2761         {
2762                 UpdateMinRecoveryPoint(record, false);
2763                 return;
2764         }
2765
2766         /* Quick exit if already known flushed */
2767         if (record <= LogwrtResult.Flush)
2768                 return;
2769
2770 #ifdef WAL_DEBUG
2771         if (XLOG_DEBUG)
2772                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2773                          (uint32) (record >> 32), (uint32) record,
2774                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2775                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2776 #endif
2777
2778         START_CRIT_SECTION();
2779
2780         /*
2781          * Since fsync is usually a horribly expensive operation, we try to
2782          * piggyback as much data as we can on each fsync: if we see any more data
2783          * entered into the xlog buffer, we'll write and fsync that too, so that
2784          * the final value of LogwrtResult.Flush is as large as possible. This
2785          * gives us some chance of avoiding another fsync immediately after.
2786          */
2787
2788         /* initialize to given target; may increase below */
2789         WriteRqstPtr = record;
2790
2791         /*
2792          * Now wait until we get the write lock, or someone else does the flush
2793          * for us.
2794          */
2795         for (;;)
2796         {
2797                 XLogRecPtr      insertpos;
2798
2799                 /* read LogwrtResult and update local state */
2800                 SpinLockAcquire(&XLogCtl->info_lck);
2801                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2802                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2803                 LogwrtResult = XLogCtl->LogwrtResult;
2804                 SpinLockRelease(&XLogCtl->info_lck);
2805
2806                 /* done already? */
2807                 if (record <= LogwrtResult.Flush)
2808                         break;
2809
2810                 /*
2811                  * Before actually performing the write, wait for all in-flight
2812                  * insertions to the pages we're about to write to finish.
2813                  */
2814                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2815
2816                 /*
2817                  * Try to get the write lock. If we can't get it immediately, wait
2818                  * until it's released, and recheck if we still need to do the flush
2819                  * or if the backend that held the lock did it for us already. This
2820                  * helps to maintain a good rate of group committing when the system
2821                  * is bottlenecked by the speed of fsyncing.
2822                  */
2823                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2824                 {
2825                         /*
2826                          * The lock is now free, but we didn't acquire it yet. Before we
2827                          * do, loop back to check if someone else flushed the record for
2828                          * us already.
2829                          */
2830                         continue;
2831                 }
2832
2833                 /* Got the lock; recheck whether request is satisfied */
2834                 LogwrtResult = XLogCtl->LogwrtResult;
2835                 if (record <= LogwrtResult.Flush)
2836                 {
2837                         LWLockRelease(WALWriteLock);
2838                         break;
2839                 }
2840
2841                 /*
2842                  * Sleep before flush! By adding a delay here, we may give further
2843                  * backends the opportunity to join the backlog of group commit
2844                  * followers; this can significantly improve transaction throughput,
2845                  * at the risk of increasing transaction latency.
2846                  *
2847                  * We do not sleep if enableFsync is not turned on, nor if there are
2848                  * fewer than CommitSiblings other backends with active transactions.
2849                  */
2850                 if (CommitDelay > 0 && enableFsync &&
2851                         MinimumActiveBackends(CommitSiblings))
2852                 {
2853                         pg_usleep(CommitDelay);
2854
2855                         /*
2856                          * Re-check how far we can now flush the WAL. It's generally not
2857                          * safe to call WaitXLogInsertionsToFinish while holding
2858                          * WALWriteLock, because an in-progress insertion might need to
2859                          * also grab WALWriteLock to make progress. But we know that all
2860                          * the insertions up to insertpos have already finished, because
2861                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2862                          * We're only calling it again to allow insertpos to be moved
2863                          * further forward, not to actually wait for anyone.
2864                          */
2865                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2866                 }
2867
2868                 /* try to write/flush later additions to XLOG as well */
2869                 WriteRqst.Write = insertpos;
2870                 WriteRqst.Flush = insertpos;
2871
2872                 XLogWrite(WriteRqst, false);
2873
2874                 LWLockRelease(WALWriteLock);
2875                 /* done */
2876                 break;
2877         }
2878
2879         END_CRIT_SECTION();
2880
2881         /* wake up walsenders now that we've released heavily contended locks */
2882         WalSndWakeupProcessRequests();
2883
2884         /*
2885          * If we still haven't flushed to the request point then we have a
2886          * problem; most likely, the requested flush point is past end of XLOG.
2887          * This has been seen to occur when a disk page has a corrupted LSN.
2888          *
2889          * Formerly we treated this as a PANIC condition, but that hurts the
2890          * system's robustness rather than helping it: we do not want to take down
2891          * the whole system due to corruption on one data page.  In particular, if
2892          * the bad page is encountered again during recovery then we would be
2893          * unable to restart the database at all!  (This scenario actually
2894          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2895          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2896          * the only time we can reach here during recovery is while flushing the
2897          * end-of-recovery checkpoint record, and we don't expect that to have a
2898          * bad LSN.
2899          *
2900          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2901          * since xact.c calls this routine inside a critical section.  However,
2902          * calls from bufmgr.c are not within critical sections and so we will not
2903          * force a restart for a bad LSN on a data page.
2904          */
2905         if (LogwrtResult.Flush < record)
2906                 elog(ERROR,
2907                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2908                          (uint32) (record >> 32), (uint32) record,
2909                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2910 }
2911
2912 /*
2913  * Write & flush xlog, but without specifying exactly where to.
2914  *
2915  * We normally write only completed blocks; but if there is nothing to do on
2916  * that basis, we check for unwritten async commits in the current incomplete
2917  * block, and write through the latest one of those.  Thus, if async commits
2918  * are not being used, we will write complete blocks only.
2919  *
2920  * If, based on the above, there's anything to write we do so immediately. But
2921  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2922  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2923  * more than wal_writer_flush_after unflushed blocks.
2924  *
2925  * We can guarantee that async commits reach disk after at most three
2926  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2927  * to write "flexibly", meaning it can stop at the end of the buffer ring;
2928  * this makes a difference only with very high load or long wal_writer_delay,
2929  * but imposes one extra cycle for the worst case for async commits.)
2930  *
2931  * This routine is invoked periodically by the background walwriter process.
2932  *
2933  * Returns TRUE if there was any work to do, even if we skipped flushing due
2934  * to wal_writer_delay/wal_writer_flush_after.
2935  */
2936 bool
2937 XLogBackgroundFlush(void)
2938 {
2939         XLogwrtRqst WriteRqst;
2940         bool            flexible = true;
2941         static TimestampTz lastflush;
2942         TimestampTz now;
2943         int                     flushbytes;
2944
2945         /* XLOG doesn't need flushing during recovery */
2946         if (RecoveryInProgress())
2947                 return false;
2948
2949         /* read LogwrtResult and update local state */
2950         SpinLockAcquire(&XLogCtl->info_lck);
2951         LogwrtResult = XLogCtl->LogwrtResult;
2952         WriteRqst = XLogCtl->LogwrtRqst;
2953         SpinLockRelease(&XLogCtl->info_lck);
2954
2955         /* back off to last completed page boundary */
2956         WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
2957
2958         /* if we have already flushed that far, consider async commit records */
2959         if (WriteRqst.Write <= LogwrtResult.Flush)
2960         {
2961                 SpinLockAcquire(&XLogCtl->info_lck);
2962                 WriteRqst.Write = XLogCtl->asyncXactLSN;
2963                 SpinLockRelease(&XLogCtl->info_lck);
2964                 flexible = false;               /* ensure it all gets written */
2965         }
2966
2967         /*
2968          * If already known flushed, we're done. Just need to check if we are
2969          * holding an open file handle to a logfile that's no longer in use,
2970          * preventing the file from being deleted.
2971          */
2972         if (WriteRqst.Write <= LogwrtResult.Flush)
2973         {
2974                 if (openLogFile >= 0)
2975                 {
2976                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2977                         {
2978                                 XLogFileClose();
2979                         }
2980                 }
2981                 return false;
2982         }
2983
2984         /*
2985          * Determine how far to flush WAL, based on the wal_writer_delay and
2986          * wal_writer_flush_after GUCs.
2987          */
2988         now = GetCurrentTimestamp();
2989         flushbytes =
2990                 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2991
2992         if (WalWriterFlushAfter == 0 || lastflush == 0)
2993         {
2994                 /* first call, or block based limits disabled */
2995                 WriteRqst.Flush = WriteRqst.Write;
2996                 lastflush = now;
2997         }
2998         else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
2999         {
3000                 /*
3001                  * Flush the writes at least every WalWriteDelay ms. This is important
3002                  * to bound the amount of time it takes for an asynchronous commit to
3003                  * hit disk.
3004                  */
3005                 WriteRqst.Flush = WriteRqst.Write;
3006                 lastflush = now;
3007         }
3008         else if (flushbytes >= WalWriterFlushAfter)
3009         {
3010                 /* exceeded wal_writer_flush_after blocks, flush */
3011                 WriteRqst.Flush = WriteRqst.Write;
3012                 lastflush = now;
3013         }
3014         else
3015         {
3016                 /* no flushing, this time round */
3017                 WriteRqst.Flush = 0;
3018         }
3019
3020 #ifdef WAL_DEBUG
3021         if (XLOG_DEBUG)
3022                 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3023                          (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3024                          (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3025                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3026                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3027 #endif
3028
3029         START_CRIT_SECTION();
3030
3031         /* now wait for any in-progress insertions to finish and get write lock */
3032         WaitXLogInsertionsToFinish(WriteRqst.Write);
3033         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3034         LogwrtResult = XLogCtl->LogwrtResult;
3035         if (WriteRqst.Write > LogwrtResult.Write ||
3036                 WriteRqst.Flush > LogwrtResult.Flush)
3037         {
3038                 XLogWrite(WriteRqst, flexible);
3039         }
3040         LWLockRelease(WALWriteLock);
3041
3042         END_CRIT_SECTION();
3043
3044         /* wake up walsenders now that we've released heavily contended locks */
3045         WalSndWakeupProcessRequests();
3046
3047         /*
3048          * Great, done. To take some work off the critical path, try to initialize
3049          * as many of the no-longer-needed WAL buffers for future use as we can.
3050          */
3051         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3052
3053         /*
3054          * If we determined that we need to write data, but somebody else
3055          * wrote/flushed already, it should be considered as being active, to
3056          * avoid hibernating too early.
3057          */
3058         return true;
3059 }
3060
3061 /*
3062  * Test whether XLOG data has been flushed up to (at least) the given position.
3063  *
3064  * Returns true if a flush is still needed.  (It may be that someone else
3065  * is already in process of flushing that far, however.)
3066  */
3067 bool
3068 XLogNeedsFlush(XLogRecPtr record)
3069 {
3070         /*
3071          * During recovery, we don't flush WAL but update minRecoveryPoint
3072          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3073          * would need to be updated.
3074          */
3075         if (RecoveryInProgress())
3076         {
3077                 /* Quick exit if already known updated */
3078                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3079                         return false;
3080
3081                 /*
3082                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3083                  * just return a conservative guess.
3084                  */
3085                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3086                         return true;
3087                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3088                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3089                 LWLockRelease(ControlFileLock);
3090
3091                 /*
3092                  * An invalid minRecoveryPoint means that we need to recover all the
3093                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3094                  * file's value in that case, so we can short-circuit future checks
3095                  * here too.
3096                  */
3097                 if (minRecoveryPoint == 0)
3098                         updateMinRecoveryPoint = false;
3099
3100                 /* check again */
3101                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3102                         return false;
3103                 else
3104                         return true;
3105         }
3106
3107         /* Quick exit if already known flushed */
3108         if (record <= LogwrtResult.Flush)
3109                 return false;
3110
3111         /* read LogwrtResult and update local state */
3112         SpinLockAcquire(&XLogCtl->info_lck);
3113         LogwrtResult = XLogCtl->LogwrtResult;
3114         SpinLockRelease(&XLogCtl->info_lck);
3115
3116         /* check again */
3117         if (record <= LogwrtResult.Flush)
3118                 return false;
3119
3120         return true;
3121 }
3122
3123 /*
3124  * Create a new XLOG file segment, or open a pre-existing one.
3125  *
3126  * log, seg: identify segment to be created/opened.
3127  *
3128  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3129  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3130  * file was used.
3131  *
3132  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3133  * place.  This should be TRUE except during bootstrap log creation.  The
3134  * caller must *not* hold the lock at call.
3135  *
3136  * Returns FD of opened file.
3137  *
3138  * Note: errors here are ERROR not PANIC because we might or might not be
3139  * inside a critical section (eg, during checkpoint there is no reason to
3140  * take down the system on failure).  They will promote to PANIC if we are
3141  * in a critical section.
3142  */
3143 int
3144 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3145 {
3146         char            path[MAXPGPATH];
3147         char            tmppath[MAXPGPATH];
3148         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
3149         char       *zbuffer;
3150         XLogSegNo       installed_segno;
3151         XLogSegNo       max_segno;
3152         int                     fd;
3153         int                     nbytes;
3154
3155         XLogFilePath(path, ThisTimeLineID, logsegno);
3156
3157         /*
3158          * Try to use existent file (checkpoint maker may have created it already)
3159          */
3160         if (*use_existent)
3161         {
3162                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3163                                                    S_IRUSR | S_IWUSR);
3164                 if (fd < 0)
3165                 {
3166                         if (errno != ENOENT)
3167                                 ereport(ERROR,
3168                                                 (errcode_for_file_access(),
3169                                                  errmsg("could not open file \"%s\": %m", path)));
3170                 }
3171                 else
3172                         return fd;
3173         }
3174
3175         /*
3176          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3177          * another process is doing the same thing.  If so, we will end up
3178          * pre-creating an extra log segment.  That seems OK, and better than
3179          * holding the lock throughout this lengthy process.
3180          */
3181         elog(DEBUG2, "creating and filling new WAL file");
3182
3183         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3184
3185         unlink(tmppath);
3186
3187         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3188         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3189                                            S_IRUSR | S_IWUSR);
3190         if (fd < 0)
3191                 ereport(ERROR,
3192                                 (errcode_for_file_access(),
3193                                  errmsg("could not create file \"%s\": %m", tmppath)));
3194
3195         /*
3196          * Zero-fill the file.  We have to do this the hard way to ensure that all
3197          * the file space has really been allocated --- on platforms that allow
3198          * "holes" in files, just seeking to the end doesn't allocate intermediate
3199          * space.  This way, we know that we have all the space and (after the
3200          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3201          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3202          * log file.
3203          *
3204          * Note: ensure the buffer is reasonably well-aligned; this may save a few
3205          * cycles transferring data to the kernel.
3206          */
3207         zbuffer = (char *) MAXALIGN(zbuffer_raw);
3208         memset(zbuffer, 0, XLOG_BLCKSZ);
3209         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3210         {
3211                 errno = 0;
3212                 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3213                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3214                 {
3215                         int                     save_errno = errno;
3216
3217                         /*
3218                          * If we fail to make the file, delete it to release disk space
3219                          */
3220                         unlink(tmppath);
3221
3222                         close(fd);
3223
3224                         /* if write didn't set errno, assume problem is no disk space */
3225                         errno = save_errno ? save_errno : ENOSPC;
3226
3227                         ereport(ERROR,
3228                                         (errcode_for_file_access(),
3229                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3230                 }
3231                 pgstat_report_wait_end();
3232         }
3233
3234         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3235         if (pg_fsync(fd) != 0)
3236         {
3237                 close(fd);
3238                 ereport(ERROR,
3239                                 (errcode_for_file_access(),
3240                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3241         }
3242         pgstat_report_wait_end();
3243
3244         if (close(fd))
3245                 ereport(ERROR,
3246                                 (errcode_for_file_access(),
3247                                  errmsg("could not close file \"%s\": %m", tmppath)));
3248
3249         /*
3250          * Now move the segment into place with its final name.
3251          *
3252          * If caller didn't want to use a pre-existing file, get rid of any
3253          * pre-existing file.  Otherwise, cope with possibility that someone else
3254          * has created the file while we were filling ours: if so, use ours to
3255          * pre-create a future log segment.
3256          */
3257         installed_segno = logsegno;
3258
3259         /*
3260          * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3261          * that was a constant, but that was always a bit dubious: normally, at a
3262          * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3263          * here, it was the offset from the insert location. We can't do the
3264          * normal XLOGfileslop calculation here because we don't have access to
3265          * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3266          * CheckPointSegments.
3267          */
3268         max_segno = logsegno + CheckPointSegments;
3269         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3270                                                                 *use_existent, max_segno,
3271                                                                 use_lock))
3272         {
3273                 /*
3274                  * No need for any more future segments, or InstallXLogFileSegment()
3275                  * failed to rename the file into place. If the rename failed, opening
3276                  * the file below will fail.
3277                  */
3278                 unlink(tmppath);
3279         }
3280
3281         /* Set flag to tell caller there was no existent file */
3282         *use_existent = false;
3283
3284         /* Now open original target segment (might not be file I just made) */
3285         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3286                                            S_IRUSR | S_IWUSR);
3287         if (fd < 0)
3288                 ereport(ERROR,
3289                                 (errcode_for_file_access(),
3290                                  errmsg("could not open file \"%s\": %m", path)));
3291
3292         elog(DEBUG2, "done creating and filling new WAL file");
3293
3294         return fd;
3295 }
3296
3297 /*
3298  * Create a new XLOG file segment by copying a pre-existing one.
3299  *
3300  * destsegno: identify segment to be created.
3301  *
3302  * srcTLI, srcsegno: identify segment to be copied (could be from
3303  *              a different timeline)
3304  *
3305  * upto: how much of the source file to copy (the rest is filled with
3306  *              zeros)
3307  *
3308  * Currently this is only used during recovery, and so there are no locking
3309  * considerations.  But we should be just as tense as XLogFileInit to avoid
3310  * emplacing a bogus file.
3311  */
3312 static void
3313 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3314                          int upto)
3315 {
3316         char            path[MAXPGPATH];
3317         char            tmppath[MAXPGPATH];
3318         char            buffer[XLOG_BLCKSZ];
3319         int                     srcfd;
3320         int                     fd;
3321         int                     nbytes;
3322
3323         /*
3324          * Open the source file
3325          */
3326         XLogFilePath(path, srcTLI, srcsegno);
3327         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3328         if (srcfd < 0)
3329                 ereport(ERROR,
3330                                 (errcode_for_file_access(),
3331                                  errmsg("could not open file \"%s\": %m", path)));
3332
3333         /*
3334          * Copy into a temp file name.
3335          */
3336         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3337
3338         unlink(tmppath);
3339
3340         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3341         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3342                                                    S_IRUSR | S_IWUSR);
3343         if (fd < 0)
3344                 ereport(ERROR,
3345                                 (errcode_for_file_access(),
3346                                  errmsg("could not create file \"%s\": %m", tmppath)));
3347
3348         /*
3349          * Do the data copying.
3350          */
3351         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3352         {
3353                 int                     nread;
3354
3355                 nread = upto - nbytes;
3356
3357                 /*
3358                  * The part that is not read from the source file is filled with
3359                  * zeros.
3360                  */
3361                 if (nread < sizeof(buffer))
3362                         memset(buffer, 0, sizeof(buffer));
3363
3364                 if (nread > 0)
3365                 {
3366                         if (nread > sizeof(buffer))
3367                                 nread = sizeof(buffer);
3368                         errno = 0;
3369                         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3370                         if (read(srcfd, buffer, nread) != nread)
3371                         {
3372                                 if (errno != 0)
3373                                         ereport(ERROR,
3374                                                         (errcode_for_file_access(),
3375                                                          errmsg("could not read file \"%s\": %m",
3376                                                                         path)));
3377                                 else
3378                                         ereport(ERROR,
3379                                                         (errmsg("not enough data in file \"%s\"",
3380                                                                         path)));
3381                         }
3382                         pgstat_report_wait_end();
3383                 }
3384                 errno = 0;
3385                 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3386                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3387                 {
3388                         int                     save_errno = errno;
3389
3390                         /*
3391                          * If we fail to make the file, delete it to release disk space
3392                          */
3393                         unlink(tmppath);
3394                         /* if write didn't set errno, assume problem is no disk space */
3395                         errno = save_errno ? save_errno : ENOSPC;
3396
3397                         ereport(ERROR,
3398                                         (errcode_for_file_access(),
3399                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3400                 }
3401                 pgstat_report_wait_end();
3402         }
3403
3404         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3405         if (pg_fsync(fd) != 0)
3406                 ereport(ERROR,
3407                                 (errcode_for_file_access(),
3408                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3409         pgstat_report_wait_end();
3410
3411         if (CloseTransientFile(fd))
3412                 ereport(ERROR,
3413                                 (errcode_for_file_access(),
3414                                  errmsg("could not close file \"%s\": %m", tmppath)));
3415
3416         CloseTransientFile(srcfd);
3417
3418         /*
3419          * Now move the segment into place with its final name.
3420          */
3421         if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3422                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3423 }
3424
3425 /*
3426  * Install a new XLOG segment file as a current or future log segment.
3427  *
3428  * This is used both to install a newly-created segment (which has a temp
3429  * filename while it's being created) and to recycle an old segment.
3430  *
3431  * *segno: identify segment to install as (or first possible target).
3432  * When find_free is TRUE, this is modified on return to indicate the
3433  * actual installation location or last segment searched.
3434  *
3435  * tmppath: initial name of file to install.  It will be renamed into place.
3436  *
3437  * find_free: if TRUE, install the new segment at the first empty segno
3438  * number at or after the passed numbers.  If FALSE, install the new segment
3439  * exactly where specified, deleting any existing segment file there.
3440  *
3441  * max_segno: maximum segment number to install the new file as.  Fail if no
3442  * free slot is found between *segno and max_segno. (Ignored when find_free
3443  * is FALSE.)
3444  *
3445  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3446  * place.  This should be TRUE except during bootstrap log creation.  The
3447  * caller must *not* hold the lock at call.
3448  *
3449  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3450  * max_segno limit was exceeded, or an error occurred while renaming the
3451  * file into place.
3452  */
3453 static bool
3454 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3455                                            bool find_free, XLogSegNo max_segno,
3456                                            bool use_lock)
3457 {
3458         char            path[MAXPGPATH];
3459         struct stat stat_buf;
3460
3461         XLogFilePath(path, ThisTimeLineID, *segno);
3462
3463         /*
3464          * We want to be sure that only one process does this at a time.
3465          */
3466         if (use_lock)
3467                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3468
3469         if (!find_free)
3470         {
3471                 /* Force installation: get rid of any pre-existing segment file */
3472                 unlink(path);
3473         }
3474         else
3475         {
3476                 /* Find a free slot to put it in */
3477                 while (stat(path, &stat_buf) == 0)
3478                 {
3479                         if ((*segno) >= max_segno)
3480                         {
3481                                 /* Failed to find a free slot within specified range */
3482                                 if (use_lock)
3483                                         LWLockRelease(ControlFileLock);
3484                                 return false;
3485                         }
3486                         (*segno)++;
3487                         XLogFilePath(path, ThisTimeLineID, *segno);
3488                 }
3489         }
3490
3491         /*
3492          * Perform the rename using link if available, paranoidly trying to avoid
3493          * overwriting an existing file (there shouldn't be one).
3494          */
3495         if (durable_link_or_rename(tmppath, path, LOG) != 0)
3496         {
3497                 if (use_lock)
3498                         LWLockRelease(ControlFileLock);
3499                 /* durable_link_or_rename already emitted log message */
3500                 return false;
3501         }
3502
3503         if (use_lock)
3504                 LWLockRelease(ControlFileLock);
3505
3506         return true;
3507 }
3508
3509 /*
3510  * Open a pre-existing logfile segment for writing.
3511  */
3512 int
3513 XLogFileOpen(XLogSegNo segno)
3514 {
3515         char            path[MAXPGPATH];
3516         int                     fd;
3517
3518         XLogFilePath(path, ThisTimeLineID, segno);
3519
3520         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3521                                            S_IRUSR | S_IWUSR);
3522         if (fd < 0)
3523                 ereport(PANIC,
3524                                 (errcode_for_file_access(),
3525                         errmsg("could not open transaction log file \"%s\": %m", path)));
3526
3527         return fd;
3528 }
3529
3530 /*
3531  * Open a logfile segment for reading (during recovery).
3532  *
3533  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3534  * Otherwise, it's assumed to be already available in pg_wal.
3535  */
3536 static int
3537 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3538                          int source, bool notfoundOk)
3539 {
3540         char            xlogfname[MAXFNAMELEN];
3541         char            activitymsg[MAXFNAMELEN + 16];
3542         char            path[MAXPGPATH];
3543         int                     fd;
3544
3545         XLogFileName(xlogfname, tli, segno);
3546
3547         switch (source)
3548         {
3549                 case XLOG_FROM_ARCHIVE:
3550                         /* Report recovery progress in PS display */
3551                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3552                                          xlogfname);
3553                         set_ps_display(activitymsg, false);
3554
3555                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3556                                                                                                           "RECOVERYXLOG",
3557                                                                                                           XLogSegSize,
3558                                                                                                           InRedo);
3559                         if (!restoredFromArchive)
3560                                 return -1;
3561                         break;
3562
3563                 case XLOG_FROM_PG_WAL:
3564                 case XLOG_FROM_STREAM:
3565                         XLogFilePath(path, tli, segno);
3566                         restoredFromArchive = false;
3567                         break;
3568
3569                 default:
3570                         elog(ERROR, "invalid XLogFileRead source %d", source);
3571         }
3572
3573         /*
3574          * If the segment was fetched from archival storage, replace the existing
3575          * xlog segment (if any) with the archival version.
3576          */
3577         if (source == XLOG_FROM_ARCHIVE)
3578         {
3579                 KeepFileRestoredFromArchive(path, xlogfname);
3580
3581                 /*
3582                  * Set path to point at the new file in pg_wal.
3583                  */
3584                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3585         }
3586
3587         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3588         if (fd >= 0)
3589         {
3590                 /* Success! */
3591                 curFileTLI = tli;
3592
3593                 /* Report recovery progress in PS display */
3594                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3595                                  xlogfname);
3596                 set_ps_display(activitymsg, false);
3597
3598                 /* Track source of data in assorted state variables */
3599                 readSource = source;
3600                 XLogReceiptSource = source;
3601                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3602                 if (source != XLOG_FROM_STREAM)
3603                         XLogReceiptTime = GetCurrentTimestamp();
3604
3605                 return fd;
3606         }
3607         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3608                 ereport(PANIC,
3609                                 (errcode_for_file_access(),
3610                                  errmsg("could not open file \"%s\": %m", path)));
3611         return -1;
3612 }
3613
3614 /*
3615  * Open a logfile segment for reading (during recovery).
3616  *
3617  * This version searches for the segment with any TLI listed in expectedTLEs.
3618  */
3619 static int
3620 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3621 {
3622         char            path[MAXPGPATH];
3623         ListCell   *cell;
3624         int                     fd;
3625         List       *tles;
3626
3627         /*
3628          * Loop looking for a suitable timeline ID: we might need to read any of
3629          * the timelines listed in expectedTLEs.
3630          *
3631          * We expect curFileTLI on entry to be the TLI of the preceding file in
3632          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3633          * to go backwards; this prevents us from picking up the wrong file when a
3634          * parent timeline extends to higher segment numbers than the child we
3635          * want to read.
3636          *
3637          * If we haven't read the timeline history file yet, read it now, so that
3638          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3639          * however, unless we actually find a valid segment.  That way if there is
3640          * neither a timeline history file nor a WAL segment in the archive, and
3641          * streaming replication is set up, we'll read the timeline history file
3642          * streamed from the master when we start streaming, instead of recovering
3643          * with a dummy history generated here.
3644          */
3645         if (expectedTLEs)
3646                 tles = expectedTLEs;
3647         else
3648                 tles = readTimeLineHistory(recoveryTargetTLI);
3649
3650         foreach(cell, tles)
3651         {
3652                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3653
3654                 if (tli < curFileTLI)
3655                         break;                          /* don't bother looking at too-old TLIs */
3656
3657                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3658                 {
3659                         fd = XLogFileRead(segno, emode, tli,
3660                                                           XLOG_FROM_ARCHIVE, true);
3661                         if (fd != -1)
3662                         {
3663                                 elog(DEBUG1, "got WAL segment from archive");
3664                                 if (!expectedTLEs)
3665                                         expectedTLEs = tles;
3666                                 return fd;
3667                         }
3668                 }
3669
3670                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3671                 {
3672                         fd = XLogFileRead(segno, emode, tli,
3673                                                           XLOG_FROM_PG_WAL, true);
3674                         if (fd != -1)
3675                         {
3676                                 if (!expectedTLEs)
3677                                         expectedTLEs = tles;
3678                                 return fd;
3679                         }
3680                 }
3681         }
3682
3683         /* Couldn't find it.  For simplicity, complain about front timeline */
3684         XLogFilePath(path, recoveryTargetTLI, segno);
3685         errno = ENOENT;
3686         ereport(emode,
3687                         (errcode_for_file_access(),
3688                          errmsg("could not open file \"%s\": %m", path)));
3689         return -1;
3690 }
3691
3692 /*
3693  * Close the current logfile segment for writing.
3694  */
3695 static void
3696 XLogFileClose(void)
3697 {
3698         Assert(openLogFile >= 0);
3699
3700         /*
3701          * WAL segment files will not be re-read in normal operation, so we advise
3702          * the OS to release any cached pages.  But do not do so if WAL archiving
3703          * or streaming is active, because archiver and walsender process could
3704          * use the cache to read the WAL segment.
3705          */
3706 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3707         if (!XLogIsNeeded())
3708                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3709 #endif
3710
3711         if (close(openLogFile))
3712                 ereport(PANIC,
3713                                 (errcode_for_file_access(),
3714                                  errmsg("could not close log file %s: %m",
3715                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3716         openLogFile = -1;
3717 }
3718
3719 /*
3720  * Preallocate log files beyond the specified log endpoint.
3721  *
3722  * XXX this is currently extremely conservative, since it forces only one
3723  * future log segment to exist, and even that only if we are 75% done with
3724  * the current one.  This is only appropriate for very low-WAL-volume systems.
3725  * High-volume systems will be OK once they've built up a sufficient set of
3726  * recycled log segments, but the startup transient is likely to include
3727  * a lot of segment creations by foreground processes, which is not so good.
3728  */
3729 static void
3730 PreallocXlogFiles(XLogRecPtr endptr)
3731 {
3732         XLogSegNo       _logSegNo;
3733         int                     lf;
3734         bool            use_existent;
3735
3736         XLByteToPrevSeg(endptr, _logSegNo);
3737         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3738         {
3739                 _logSegNo++;
3740                 use_existent = true;
3741                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3742                 close(lf);
3743                 if (!use_existent)
3744                         CheckpointStats.ckpt_segs_added++;
3745         }
3746 }
3747
3748 /*
3749  * Throws an error if the given log segment has already been removed or
3750  * recycled. The caller should only pass a segment that it knows to have
3751  * existed while the server has been running, as this function always
3752  * succeeds if no WAL segments have been removed since startup.
3753  * 'tli' is only used in the error message.
3754  */
3755 void
3756 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3757 {
3758         XLogSegNo       lastRemovedSegNo;
3759
3760         SpinLockAcquire(&XLogCtl->info_lck);
3761         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3762         SpinLockRelease(&XLogCtl->info_lck);
3763
3764         if (segno <= lastRemovedSegNo)
3765         {
3766                 char            filename[MAXFNAMELEN];
3767
3768                 XLogFileName(filename, tli, segno);
3769                 ereport(ERROR,
3770                                 (errcode_for_file_access(),
3771                                  errmsg("requested WAL segment %s has already been removed",
3772                                                 filename)));
3773         }
3774 }
3775
3776 /*
3777  * Return the last WAL segment removed, or 0 if no segment has been removed
3778  * since startup.
3779  *
3780  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3781  * with that.
3782  */
3783 XLogSegNo
3784 XLogGetLastRemovedSegno(void)
3785 {
3786         XLogSegNo       lastRemovedSegNo;
3787
3788         SpinLockAcquire(&XLogCtl->info_lck);
3789         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3790         SpinLockRelease(&XLogCtl->info_lck);
3791
3792         return lastRemovedSegNo;
3793 }
3794
3795 /*
3796  * Update the last removed segno pointer in shared memory, to reflect
3797  * that the given XLOG file has been removed.
3798  */
3799 static void
3800 UpdateLastRemovedPtr(char *filename)
3801 {
3802         uint32          tli;
3803         XLogSegNo       segno;
3804
3805         XLogFromFileName(filename, &tli, &segno);
3806
3807         SpinLockAcquire(&XLogCtl->info_lck);
3808         if (segno > XLogCtl->lastRemovedSegNo)
3809                 XLogCtl->lastRemovedSegNo = segno;
3810         SpinLockRelease(&XLogCtl->info_lck);
3811 }
3812
3813 /*
3814  * Recycle or remove all log files older or equal to passed segno.
3815  *
3816  * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3817  * redo pointer of the previous checkpoint. These are used to determine
3818  * whether we want to recycle rather than delete no-longer-wanted log files.
3819  */
3820 static void
3821 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3822 {
3823         DIR                *xldir;
3824         struct dirent *xlde;
3825         char            lastoff[MAXFNAMELEN];
3826
3827         xldir = AllocateDir(XLOGDIR);
3828         if (xldir == NULL)
3829                 ereport(ERROR,
3830                                 (errcode_for_file_access(),
3831                                  errmsg("could not open transaction log directory \"%s\": %m",
3832                                                 XLOGDIR)));
3833
3834         /*
3835          * Construct a filename of the last segment to be kept. The timeline ID
3836          * doesn't matter, we ignore that in the comparison. (During recovery,
3837          * ThisTimeLineID isn't set, so we can't use that.)
3838          */
3839         XLogFileName(lastoff, 0, segno);
3840
3841         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3842                  lastoff);
3843
3844         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3845         {
3846                 /* Ignore files that are not XLOG segments */
3847                 if (!IsXLogFileName(xlde->d_name) &&
3848                         !IsPartialXLogFileName(xlde->d_name))
3849                         continue;
3850
3851                 /*
3852                  * We ignore the timeline part of the XLOG segment identifiers in
3853                  * deciding whether a segment is still needed.  This ensures that we
3854                  * won't prematurely remove a segment from a parent timeline. We could
3855                  * probably be a little more proactive about removing segments of
3856                  * non-parent timelines, but that would be a whole lot more
3857                  * complicated.
3858                  *
3859                  * We use the alphanumeric sorting property of the filenames to decide
3860                  * which ones are earlier than the lastoff segment.
3861                  */
3862                 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3863                 {
3864                         if (XLogArchiveCheckDone(xlde->d_name))
3865                         {
3866                                 /* Update the last removed location in shared memory first */
3867                                 UpdateLastRemovedPtr(xlde->d_name);
3868
3869                                 RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
3870                         }
3871                 }
3872         }
3873
3874         FreeDir(xldir);
3875 }
3876
3877 /*
3878  * Remove WAL files that are not part of the given timeline's history.
3879  *
3880  * This is called during recovery, whenever we switch to follow a new
3881  * timeline, and at the end of recovery when we create a new timeline. We
3882  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3883  * might be leftover pre-allocated or recycled WAL segments on the old timeline
3884  * that we haven't used yet, and contain garbage. If we just leave them in
3885  * pg_wal, they will eventually be archived, and we can't let that happen.
3886  * Files that belong to our timeline history are valid, because we have
3887  * successfully replayed them, but from others we can't be sure.
3888  *
3889  * 'switchpoint' is the current point in WAL where we switch to new timeline,
3890  * and 'newTLI' is the new timeline we switch to.
3891  */
3892 static void
3893 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3894 {
3895         DIR                *xldir;
3896         struct dirent *xlde;
3897         char            switchseg[MAXFNAMELEN];
3898         XLogSegNo       endLogSegNo;
3899
3900         XLByteToPrevSeg(switchpoint, endLogSegNo);
3901
3902         xldir = AllocateDir(XLOGDIR);
3903         if (xldir == NULL)
3904                 ereport(ERROR,
3905                                 (errcode_for_file_access(),
3906                                  errmsg("could not open transaction log directory \"%s\": %m",
3907                                                 XLOGDIR)));
3908
3909         /*
3910          * Construct a filename of the last segment to be kept.
3911          */
3912         XLogFileName(switchseg, newTLI, endLogSegNo);
3913
3914         elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
3915                  switchseg);
3916
3917         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3918         {
3919                 /* Ignore files that are not XLOG segments */
3920                 if (!IsXLogFileName(xlde->d_name))
3921                         continue;
3922
3923                 /*
3924                  * Remove files that are on a timeline older than the new one we're
3925                  * switching to, but with a segment number >= the first segment on the
3926                  * new timeline.
3927                  */
3928                 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
3929                         strcmp(xlde->d_name + 8, switchseg + 8) > 0)
3930                 {
3931                         /*
3932                          * If the file has already been marked as .ready, however, don't
3933                          * remove it yet. It should be OK to remove it - files that are
3934                          * not part of our timeline history are not required for recovery
3935                          * - but seems safer to let them be archived and removed later.
3936                          */
3937                         if (!XLogArchiveIsReady(xlde->d_name))
3938                                 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
3939                 }
3940         }
3941
3942         FreeDir(xldir);
3943 }
3944
3945 /*
3946  * Recycle or remove a log file that's no longer needed.
3947  *
3948  * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3949  * redo pointer of the previous checkpoint. These are used to determine
3950  * whether we want to recycle rather than delete no-longer-wanted log files.
3951  * If PriorRedoRecPtr is not known, pass invalid, and the function will
3952  * recycle, somewhat arbitrarily, 10 future segments.
3953  */
3954 static void
3955 RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3956 {
3957         char            path[MAXPGPATH];
3958 #ifdef WIN32
3959         char            newpath[MAXPGPATH];
3960 #endif
3961         struct stat statbuf;
3962         XLogSegNo       endlogSegNo;
3963         XLogSegNo       recycleSegNo;
3964
3965         /*
3966          * Initialize info about where to try to recycle to.
3967          */
3968         XLByteToPrevSeg(endptr, endlogSegNo);
3969         if (PriorRedoPtr == InvalidXLogRecPtr)
3970                 recycleSegNo = endlogSegNo + 10;
3971         else
3972                 recycleSegNo = XLOGfileslop(PriorRedoPtr);
3973
3974         snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
3975
3976         /*
3977          * Before deleting the file, see if it can be recycled as a future log
3978          * segment. Only recycle normal files, pg_standby for example can create
3979          * symbolic links pointing to a separate archive directory.
3980          */
3981         if (endlogSegNo <= recycleSegNo &&
3982                 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3983                 InstallXLogFileSegment(&endlogSegNo, path,
3984                                                            true, recycleSegNo, true))
3985         {
3986                 ereport(DEBUG2,
3987                                 (errmsg("recycled transaction log file \"%s\"",
3988                                                 segname)));
3989                 CheckpointStats.ckpt_segs_recycled++;
3990                 /* Needn't recheck that slot on future iterations */
3991                 endlogSegNo++;
3992         }
3993         else
3994         {
3995                 /* No need for any more future segments... */
3996                 int                     rc;
3997
3998                 ereport(DEBUG2,
3999                                 (errmsg("removing transaction log file \"%s\"",
4000                                                 segname)));
4001
4002 #ifdef WIN32
4003
4004                 /*
4005                  * On Windows, if another process (e.g another backend) holds the file
4006                  * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4007                  * will still show up in directory listing until the last handle is
4008                  * closed. To avoid confusing the lingering deleted file for a live
4009                  * WAL file that needs to be archived, rename it before deleting it.
4010                  *
4011                  * If another process holds the file open without FILE_SHARE_DELETE
4012                  * flag, rename will fail. We'll try again at the next checkpoint.
4013                  */
4014                 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4015                 if (rename(path, newpath) != 0)
4016                 {
4017                         ereport(LOG,
4018                                         (errcode_for_file_access(),
4019                            errmsg("could not rename old transaction log file \"%s\": %m",
4020                                           path)));
4021                         return;
4022                 }
4023                 rc = unlink(newpath);
4024 #else
4025                 rc = unlink(path);
4026 #endif
4027                 if (rc != 0)
4028                 {
4029                         ereport(LOG,
4030                                         (errcode_for_file_access(),
4031                            errmsg("could not remove old transaction log file \"%s\": %m",
4032                                           path)));
4033                         return;
4034                 }
4035                 CheckpointStats.ckpt_segs_removed++;
4036         }
4037
4038         XLogArchiveCleanup(segname);
4039 }
4040
4041 /*
4042  * Verify whether pg_wal and pg_wal/archive_status exist.
4043  * If the latter does not exist, recreate it.
4044  *
4045  * It is not the goal of this function to verify the contents of these
4046  * directories, but to help in cases where someone has performed a cluster
4047  * copy for PITR purposes but omitted pg_wal from the copy.
4048  *
4049  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4050  * policy decision was made not to.  It is fairly common for pg_wal to be
4051  * a symlink, and if that was the DBA's intent then automatically making a
4052  * plain directory would result in degraded performance with no notice.
4053  */
4054 static void
4055 ValidateXLOGDirectoryStructure(void)
4056 {
4057         char            path[MAXPGPATH];
4058         struct stat stat_buf;
4059
4060         /* Check for pg_wal; if it doesn't exist, error out */
4061         if (stat(XLOGDIR, &stat_buf) != 0 ||
4062                 !S_ISDIR(stat_buf.st_mode))
4063                 ereport(FATAL,
4064                                 (errmsg("required WAL directory \"%s\" does not exist",
4065                                                 XLOGDIR)));
4066
4067         /* Check for archive_status */
4068         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4069         if (stat(path, &stat_buf) == 0)
4070         {
4071                 /* Check for weird cases where it exists but isn't a directory */
4072                 if (!S_ISDIR(stat_buf.st_mode))
4073                         ereport(FATAL,
4074                                         (errmsg("required WAL directory \"%s\" does not exist",
4075                                                         path)));
4076         }
4077         else
4078         {
4079                 ereport(LOG,
4080                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4081                 if (mkdir(path, S_IRWXU) < 0)
4082                         ereport(FATAL,
4083                                         (errmsg("could not create missing directory \"%s\": %m",
4084                                                         path)));
4085         }
4086 }
4087
4088 /*
4089  * Remove previous backup history files.  This also retries creation of
4090  * .ready files for any backup history files for which XLogArchiveNotify
4091  * failed earlier.
4092  */
4093 static void
4094 CleanupBackupHistory(void)
4095 {
4096         DIR                *xldir;
4097         struct dirent *xlde;
4098         char            path[MAXPGPATH];
4099
4100         xldir = AllocateDir(XLOGDIR);
4101         if (xldir == NULL)
4102                 ereport(ERROR,
4103                                 (errcode_for_file_access(),
4104                                  errmsg("could not open transaction log directory \"%s\": %m",
4105                                                 XLOGDIR)));
4106
4107         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4108         {
4109                 if (IsBackupHistoryFileName(xlde->d_name))
4110                 {
4111                         if (XLogArchiveCheckDone(xlde->d_name))
4112                         {
4113                                 ereport(DEBUG2,
4114                                 (errmsg("removing transaction log backup history file \"%s\"",
4115                                                 xlde->d_name)));
4116                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4117                                 unlink(path);
4118                                 XLogArchiveCleanup(xlde->d_name);
4119                         }
4120                 }
4121         }
4122
4123         FreeDir(xldir);
4124 }
4125
4126 /*
4127  * Attempt to read an XLOG record.
4128  *
4129  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4130  * try to read a record just after the last one previously read.
4131  *
4132  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4133  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4134  * record is available.
4135  *
4136  * The record is copied into readRecordBuf, so that on successful return,
4137  * the returned record pointer always points there.
4138  */
4139 static XLogRecord *
4140 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4141                    bool fetching_ckpt)
4142 {
4143         XLogRecord *record;
4144         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4145
4146         /* Pass through parameters to XLogPageRead */
4147         private->fetching_ckpt = fetching_ckpt;
4148         private->emode = emode;
4149         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4150
4151         /* This is the first attempt to read this page. */
4152         lastSourceFailed = false;
4153
4154         for (;;)
4155         {
4156                 char       *errormsg;
4157
4158                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4159                 ReadRecPtr = xlogreader->ReadRecPtr;
4160                 EndRecPtr = xlogreader->EndRecPtr;
4161                 if (record == NULL)
4162                 {
4163                         if (readFile >= 0)
4164                         {
4165                                 close(readFile);
4166                                 readFile = -1;
4167                         }
4168
4169                         /*
4170                          * We only end up here without a message when XLogPageRead()
4171                          * failed - in that case we already logged something. In
4172                          * StandbyMode that only happens if we have been triggered, so we
4173                          * shouldn't loop anymore in that case.
4174                          */
4175                         if (errormsg)
4176                                 ereport(emode_for_corrupt_record(emode,
4177                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4178                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4179                 }
4180
4181                 /*
4182                  * Check page TLI is one of the expected values.
4183                  */
4184                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4185                 {
4186                         char            fname[MAXFNAMELEN];
4187                         XLogSegNo       segno;
4188                         int32           offset;
4189
4190                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4191                         offset = xlogreader->latestPagePtr % XLogSegSize;
4192                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4193                         ereport(emode_for_corrupt_record(emode,
4194                                                                                          RecPtr ? RecPtr : EndRecPtr),
4195                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4196                                         xlogreader->latestPageTLI,
4197                                         fname,
4198                                         offset)));
4199                         record = NULL;
4200                 }
4201
4202                 if (record)
4203                 {
4204                         /* Great, got a record */
4205                         return record;
4206                 }
4207                 else
4208                 {
4209                         /* No valid record available from this source */
4210                         lastSourceFailed = true;
4211
4212                         /*
4213                          * If archive recovery was requested, but we were still doing
4214                          * crash recovery, switch to archive recovery and retry using the
4215                          * offline archive. We have now replayed all the valid WAL in
4216                          * pg_wal, so we are presumably now consistent.
4217                          *
4218                          * We require that there's at least some valid WAL present in
4219                          * pg_wal, however (!fetch_ckpt). We could recover using the WAL
4220                          * from the archive, even if pg_wal is completely empty, but we'd
4221                          * have no idea how far we'd have to replay to reach consistency.
4222                          * So err on the safe side and give up.
4223                          */
4224                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4225                                 !fetching_ckpt)
4226                         {
4227                                 ereport(DEBUG1,
4228                                                 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4229                                 InArchiveRecovery = true;
4230                                 if (StandbyModeRequested)
4231                                         StandbyMode = true;
4232
4233                                 /* initialize minRecoveryPoint to this record */
4234                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4235                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4236                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4237                                 {
4238                                         ControlFile->minRecoveryPoint = EndRecPtr;
4239                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4240                                 }
4241                                 /* update local copy */
4242                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4243                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4244
4245                                 UpdateControlFile();
4246                                 LWLockRelease(ControlFileLock);
4247
4248                                 CheckRecoveryConsistency();
4249
4250                                 /*
4251                                  * Before we retry, reset lastSourceFailed and currentSource
4252                                  * so that we will check the archive next.
4253                                  */
4254                                 lastSourceFailed = false;
4255                                 currentSource = 0;
4256
4257                                 continue;
4258                         }
4259
4260                         /* In standby mode, loop back to retry. Otherwise, give up. */
4261                         if (StandbyMode && !CheckForStandbyTrigger())
4262                                 continue;
4263                         else
4264                                 return NULL;
4265                 }
4266         }
4267 }
4268
4269 /*
4270  * Scan for new timelines that might have appeared in the archive since we
4271  * started recovery.
4272  *
4273  * If there are any, the function changes recovery target TLI to the latest
4274  * one and returns 'true'.
4275  */
4276 static bool
4277 rescanLatestTimeLine(void)
4278 {
4279         List       *newExpectedTLEs;
4280         bool            found;
4281         ListCell   *cell;
4282         TimeLineID      newtarget;
4283         TimeLineID      oldtarget = recoveryTargetTLI;
4284         TimeLineHistoryEntry *currentTle = NULL;
4285
4286         newtarget = findNewestTimeLine(recoveryTargetTLI);
4287         if (newtarget == recoveryTargetTLI)
4288         {
4289                 /* No new timelines found */
4290                 return false;
4291         }
4292
4293         /*
4294          * Determine the list of expected TLIs for the new TLI
4295          */
4296
4297         newExpectedTLEs = readTimeLineHistory(newtarget);
4298
4299         /*
4300          * If the current timeline is not part of the history of the new timeline,
4301          * we cannot proceed to it.
4302          */
4303         found = false;
4304         foreach(cell, newExpectedTLEs)
4305         {
4306                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4307
4308                 if (currentTle->tli == recoveryTargetTLI)
4309                 {
4310                         found = true;
4311                         break;
4312                 }
4313         }
4314         if (!found)
4315         {
4316                 ereport(LOG,
4317                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4318                                                 newtarget,
4319                                                 ThisTimeLineID)));
4320                 return false;
4321         }
4322
4323         /*
4324          * The current timeline was found in the history file, but check that the
4325          * next timeline was forked off from it *after* the current recovery
4326          * location.
4327          */
4328         if (currentTle->end < EndRecPtr)
4329         {
4330                 ereport(LOG,
4331                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4332                                                 newtarget,
4333                                                 ThisTimeLineID,
4334                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4335                 return false;
4336         }
4337
4338         /* The new timeline history seems valid. Switch target */
4339         recoveryTargetTLI = newtarget;
4340         list_free_deep(expectedTLEs);
4341         expectedTLEs = newExpectedTLEs;
4342
4343         /*
4344          * As in StartupXLOG(), try to ensure we have all the history files
4345          * between the old target and new target in pg_wal.
4346          */
4347         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4348
4349         ereport(LOG,
4350                         (errmsg("new target timeline is %u",
4351                                         recoveryTargetTLI)));
4352
4353         return true;
4354 }
4355
4356 /*
4357  * I/O routines for pg_control
4358  *
4359  * *ControlFile is a buffer in shared memory that holds an image of the
4360  * contents of pg_control.  WriteControlFile() initializes pg_control
4361  * given a preloaded buffer, ReadControlFile() loads the buffer from
4362  * the pg_control file (during postmaster or standalone-backend startup),
4363  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4364  *
4365  * For simplicity, WriteControlFile() initializes the fields of pg_control
4366  * that are related to checking backend/database compatibility, and
4367  * ReadControlFile() verifies they are correct.  We could split out the
4368  * I/O and compatibility-check functions, but there seems no need currently.
4369  */
4370 static void
4371 WriteControlFile(void)
4372 {
4373         int                     fd;
4374         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4375
4376         /*
4377          * Initialize version and compatibility-check fields
4378          */
4379         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4380         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4381
4382         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4383         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4384
4385         ControlFile->blcksz = BLCKSZ;
4386         ControlFile->relseg_size = RELSEG_SIZE;
4387         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4388         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4389
4390         ControlFile->nameDataLen = NAMEDATALEN;
4391         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4392
4393         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4394         ControlFile->loblksize = LOBLKSIZE;
4395
4396         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4397         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4398
4399         /* Contents are protected with a CRC */
4400         INIT_CRC32C(ControlFile->crc);
4401         COMP_CRC32C(ControlFile->crc,
4402                                 (char *) ControlFile,
4403                                 offsetof(ControlFileData, crc));
4404         FIN_CRC32C(ControlFile->crc);
4405
4406         /*
4407          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4408          * excess over sizeof(ControlFileData).  This reduces the odds of
4409          * premature-EOF errors when reading pg_control.  We'll still fail when we
4410          * check the contents of the file, but hopefully with a more specific
4411          * error than "couldn't read pg_control".
4412          */
4413         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4414                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4415
4416         memset(buffer, 0, PG_CONTROL_SIZE);
4417         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4418
4419         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4420                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4421                                            S_IRUSR | S_IWUSR);
4422         if (fd < 0)
4423                 ereport(PANIC,
4424                                 (errcode_for_file_access(),
4425                                  errmsg("could not create control file \"%s\": %m",
4426                                                 XLOG_CONTROL_FILE)));
4427
4428         errno = 0;
4429         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4430         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4431         {
4432                 /* if write didn't set errno, assume problem is no disk space */
4433                 if (errno == 0)
4434                         errno = ENOSPC;
4435                 ereport(PANIC,
4436                                 (errcode_for_file_access(),
4437                                  errmsg("could not write to control file: %m")));
4438         }
4439         pgstat_report_wait_end();
4440
4441         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4442         if (pg_fsync(fd) != 0)
4443                 ereport(PANIC,
4444                                 (errcode_for_file_access(),
4445                                  errmsg("could not fsync control file: %m")));
4446         pgstat_report_wait_end();
4447
4448         if (close(fd))
4449                 ereport(PANIC,
4450                                 (errcode_for_file_access(),
4451                                  errmsg("could not close control file: %m")));
4452 }
4453
4454 static void
4455 ReadControlFile(void)
4456 {
4457         pg_crc32c       crc;
4458         int                     fd;
4459
4460         /*
4461          * Read data...
4462          */
4463         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4464                                            O_RDWR | PG_BINARY,
4465                                            S_IRUSR | S_IWUSR);
4466         if (fd < 0)
4467                 ereport(PANIC,
4468                                 (errcode_for_file_access(),
4469                                  errmsg("could not open control file \"%s\": %m",
4470                                                 XLOG_CONTROL_FILE)));
4471
4472         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4473         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4474                 ereport(PANIC,
4475                                 (errcode_for_file_access(),
4476                                  errmsg("could not read from control file: %m")));
4477         pgstat_report_wait_end();
4478
4479         close(fd);
4480
4481         /*
4482          * Check for expected pg_control format version.  If this is wrong, the
4483          * CRC check will likely fail because we'll be checking the wrong number
4484          * of bytes.  Complaining about wrong version will probably be more
4485          * enlightening than complaining about wrong CRC.
4486          */
4487
4488         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4489                 ereport(FATAL,
4490                                 (errmsg("database files are incompatible with server"),
4491                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4492                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4493                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4494                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4495                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4496
4497         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4498                 ereport(FATAL,
4499                                 (errmsg("database files are incompatible with server"),
4500                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4501                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4502                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4503                                  errhint("It looks like you need to initdb.")));
4504
4505         /* Now check the CRC. */
4506         INIT_CRC32C(crc);
4507         COMP_CRC32C(crc,
4508                                 (char *) ControlFile,
4509                                 offsetof(ControlFileData, crc));
4510         FIN_CRC32C(crc);
4511
4512         if (!EQ_CRC32C(crc, ControlFile->crc))
4513                 ereport(FATAL,
4514                                 (errmsg("incorrect checksum in control file")));
4515
4516         /*
4517          * Do compatibility checking immediately.  If the database isn't
4518          * compatible with the backend executable, we want to abort before we can
4519          * possibly do any damage.
4520          */
4521         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4522                 ereport(FATAL,
4523                                 (errmsg("database files are incompatible with server"),
4524                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4525                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4526                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4527                                  errhint("It looks like you need to initdb.")));
4528         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4529                 ereport(FATAL,
4530                                 (errmsg("database files are incompatible with server"),
4531                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4532                                          " but the server was compiled with MAXALIGN %d.",
4533                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4534                                  errhint("It looks like you need to initdb.")));
4535         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4536                 ereport(FATAL,
4537                                 (errmsg("database files are incompatible with server"),
4538                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4539                                  errhint("It looks like you need to initdb.")));
4540         if (ControlFile->blcksz != BLCKSZ)
4541                 ereport(FATAL,
4542                                 (errmsg("database files are incompatible with server"),
4543                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4544                                            " but the server was compiled with BLCKSZ %d.",
4545                                            ControlFile->blcksz, BLCKSZ),
4546                                  errhint("It looks like you need to recompile or initdb.")));
4547         if (ControlFile->relseg_size != RELSEG_SIZE)
4548                 ereport(FATAL,
4549                                 (errmsg("database files are incompatible with server"),
4550                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4551                                   " but the server was compiled with RELSEG_SIZE %d.",
4552                                   ControlFile->relseg_size, RELSEG_SIZE),
4553                                  errhint("It looks like you need to recompile or initdb.")));
4554         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4555                 ereport(FATAL,
4556                                 (errmsg("database files are incompatible with server"),
4557                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4558                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4559                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4560                                  errhint("It looks like you need to recompile or initdb.")));
4561         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4562                 ereport(FATAL,
4563                                 (errmsg("database files are incompatible with server"),
4564                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4565                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4566                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4567                                  errhint("It looks like you need to recompile or initdb.")));
4568         if (ControlFile->nameDataLen != NAMEDATALEN)
4569                 ereport(FATAL,
4570                                 (errmsg("database files are incompatible with server"),
4571                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4572                                   " but the server was compiled with NAMEDATALEN %d.",
4573                                   ControlFile->nameDataLen, NAMEDATALEN),
4574                                  errhint("It looks like you need to recompile or initdb.")));
4575         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4576                 ereport(FATAL,
4577                                 (errmsg("database files are incompatible with server"),
4578                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4579                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4580                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4581                                  errhint("It looks like you need to recompile or initdb.")));
4582         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4583                 ereport(FATAL,
4584                                 (errmsg("database files are incompatible with server"),
4585                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4586                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4587                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4588                                  errhint("It looks like you need to recompile or initdb.")));
4589         if (ControlFile->loblksize != LOBLKSIZE)
4590                 ereport(FATAL,
4591                                 (errmsg("database files are incompatible with server"),
4592                   errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4593                                         " but the server was compiled with LOBLKSIZE %d.",
4594                                         ControlFile->loblksize, (int) LOBLKSIZE),
4595                                  errhint("It looks like you need to recompile or initdb.")));
4596
4597 #ifdef USE_FLOAT4_BYVAL
4598         if (ControlFile->float4ByVal != true)
4599                 ereport(FATAL,
4600                                 (errmsg("database files are incompatible with server"),
4601                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4602                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4603                                  errhint("It looks like you need to recompile or initdb.")));
4604 #else
4605         if (ControlFile->float4ByVal != false)
4606                 ereport(FATAL,
4607                                 (errmsg("database files are incompatible with server"),
4608                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4609                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4610                                  errhint("It looks like you need to recompile or initdb.")));
4611 #endif
4612
4613 #ifdef USE_FLOAT8_BYVAL
4614         if (ControlFile->float8ByVal != true)
4615                 ereport(FATAL,
4616                                 (errmsg("database files are incompatible with server"),
4617                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4618                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4619                                  errhint("It looks like you need to recompile or initdb.")));
4620 #else
4621         if (ControlFile->float8ByVal != false)
4622                 ereport(FATAL,
4623                                 (errmsg("database files are incompatible with server"),
4624                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4625                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4626                                  errhint("It looks like you need to recompile or initdb.")));
4627 #endif
4628
4629         /* Make the initdb settings visible as GUC variables, too */
4630         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4631                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4632 }
4633
4634 void
4635 UpdateControlFile(void)
4636 {
4637         int                     fd;
4638
4639         INIT_CRC32C(ControlFile->crc);
4640         COMP_CRC32C(ControlFile->crc,
4641                                 (char *) ControlFile,
4642                                 offsetof(ControlFileData, crc));
4643         FIN_CRC32C(ControlFile->crc);
4644
4645         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4646                                            O_RDWR | PG_BINARY,
4647                                            S_IRUSR | S_IWUSR);
4648         if (fd < 0)
4649                 ereport(PANIC,
4650                                 (errcode_for_file_access(),
4651                                  errmsg("could not open control file \"%s\": %m",
4652                                                 XLOG_CONTROL_FILE)));
4653
4654         errno = 0;
4655         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
4656         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4657         {
4658                 /* if write didn't set errno, assume problem is no disk space */
4659                 if (errno == 0)
4660                         errno = ENOSPC;
4661                 ereport(PANIC,
4662                                 (errcode_for_file_access(),
4663                                  errmsg("could not write to control file: %m")));
4664         }
4665         pgstat_report_wait_end();
4666
4667         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
4668         if (pg_fsync(fd) != 0)
4669                 ereport(PANIC,
4670                                 (errcode_for_file_access(),
4671                                  errmsg("could not fsync control file: %m")));
4672         pgstat_report_wait_end();
4673
4674         if (close(fd))
4675                 ereport(PANIC,
4676                                 (errcode_for_file_access(),
4677                                  errmsg("could not close control file: %m")));
4678 }
4679
4680 /*
4681  * Returns the unique system identifier from control file.
4682  */
4683 uint64
4684 GetSystemIdentifier(void)
4685 {
4686         Assert(ControlFile != NULL);
4687         return ControlFile->system_identifier;
4688 }
4689
4690 /*
4691  * Returns the random nonce from control file.
4692  */
4693 char *
4694 GetMockAuthenticationNonce(void)
4695 {
4696         Assert(ControlFile != NULL);
4697         return ControlFile->mock_authentication_nonce;
4698 }
4699
4700 /*
4701  * Are checksums enabled for data pages?
4702  */
4703 bool
4704 DataChecksumsEnabled(void)
4705 {
4706         Assert(ControlFile != NULL);
4707         return (ControlFile->data_checksum_version > 0);
4708 }
4709
4710 /*
4711  * Returns a fake LSN for unlogged relations.
4712  *
4713  * Each call generates an LSN that is greater than any previous value
4714  * returned. The current counter value is saved and restored across clean
4715  * shutdowns, but like unlogged relations, does not survive a crash. This can
4716  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4717  * LSN-like increasing sequence of numbers without writing any WAL.
4718  */
4719 XLogRecPtr
4720 GetFakeLSNForUnloggedRel(void)
4721 {
4722         XLogRecPtr      nextUnloggedLSN;
4723
4724         /* increment the unloggedLSN counter, need SpinLock */
4725         SpinLockAcquire(&XLogCtl->ulsn_lck);
4726         nextUnloggedLSN = XLogCtl->unloggedLSN++;
4727         SpinLockRelease(&XLogCtl->ulsn_lck);
4728
4729         return nextUnloggedLSN;
4730 }
4731
4732 /*
4733  * Auto-tune the number of XLOG buffers.
4734  *
4735  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4736  * a maximum of one XLOG segment (there is little reason to think that more
4737  * is helpful, at least so long as we force an fsync when switching log files)
4738  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4739  * 9.1, when auto-tuning was added).
4740  *
4741  * This should not be called until NBuffers has received its final value.
4742  */
4743 static int
4744 XLOGChooseNumBuffers(void)
4745 {
4746         int                     xbuffers;
4747
4748         xbuffers = NBuffers / 32;
4749         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4750                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4751         if (xbuffers < 8)
4752                 xbuffers = 8;
4753         return xbuffers;
4754 }
4755
4756 /*
4757  * GUC check_hook for wal_buffers
4758  */
4759 bool
4760 check_wal_buffers(int *newval, void **extra, GucSource source)
4761 {
4762         /*
4763          * -1 indicates a request for auto-tune.
4764          */
4765         if (*newval == -1)
4766         {
4767                 /*
4768                  * If we haven't yet changed the boot_val default of -1, just let it
4769                  * be.  We'll fix it when XLOGShmemSize is called.
4770                  */
4771                 if (XLOGbuffers == -1)
4772                         return true;
4773
4774                 /* Otherwise, substitute the auto-tune value */
4775                 *newval = XLOGChooseNumBuffers();
4776         }
4777
4778         /*
4779          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4780          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4781          * the case, we just silently treat such values as a request for the
4782          * minimum.  (We could throw an error instead, but that doesn't seem very
4783          * helpful.)
4784          */
4785         if (*newval < 4)
4786                 *newval = 4;
4787
4788         return true;
4789 }
4790
4791 /*
4792  * Initialization of shared memory for XLOG
4793  */
4794 Size
4795 XLOGShmemSize(void)
4796 {
4797         Size            size;
4798
4799         /*
4800          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4801          * This isn't an amazingly clean place to do this, but we must wait till
4802          * NBuffers has received its final value, and must do it before using the
4803          * value of XLOGbuffers to do anything important.
4804          */
4805         if (XLOGbuffers == -1)
4806         {
4807                 char            buf[32];
4808
4809                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4810                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4811         }
4812         Assert(XLOGbuffers > 0);
4813
4814         /* XLogCtl */
4815         size = sizeof(XLogCtlData);
4816
4817         /* WAL insertion locks, plus alignment */
4818         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4819         /* xlblocks array */
4820         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4821         /* extra alignment padding for XLOG I/O buffers */
4822         size = add_size(size, XLOG_BLCKSZ);
4823         /* and the buffers themselves */
4824         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4825
4826         /*
4827          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4828          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4829          * routine again below to compute the actual allocation size.
4830          */
4831
4832         return size;
4833 }
4834
4835 void
4836 XLOGShmemInit(void)
4837 {
4838         bool            foundCFile,
4839                                 foundXLog;
4840         char       *allocptr;
4841         int                     i;
4842
4843 #ifdef WAL_DEBUG
4844
4845         /*
4846          * Create a memory context for WAL debugging that's exempt from the normal
4847          * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
4848          * an allocation fails, but wal_debug is not for production use anyway.
4849          */
4850         if (walDebugCxt == NULL)
4851         {
4852                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4853                                                                                         "WAL Debug",
4854                                                                                         ALLOCSET_DEFAULT_SIZES);
4855                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
4856         }
4857 #endif
4858
4859         ControlFile = (ControlFileData *)
4860                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4861         XLogCtl = (XLogCtlData *)
4862                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4863
4864         if (foundCFile || foundXLog)
4865         {
4866                 /* both should be present or neither */
4867                 Assert(foundCFile && foundXLog);
4868
4869                 /* Initialize local copy of WALInsertLocks and register the tranche */
4870                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
4871                 LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
4872                                                           "wal_insert");
4873                 return;
4874         }
4875         memset(XLogCtl, 0, sizeof(XLogCtlData));
4876
4877         /*
4878          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4879          * multiple of the alignment for same, so no extra alignment padding is
4880          * needed here.
4881          */
4882         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4883         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4884         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4885         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4886
4887
4888         /* WAL insertion locks. Ensure they're aligned to the full padded size */
4889         allocptr += sizeof(WALInsertLockPadded) -
4890                 ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
4891         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
4892                 (WALInsertLockPadded *) allocptr;
4893         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
4894
4895         LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
4896         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
4897         {
4898                 LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
4899                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
4900                 WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
4901         }
4902
4903         /*
4904          * Align the start of the page buffers to a full xlog block size boundary.
4905          * This simplifies some calculations in XLOG insertion. It is also
4906          * required for O_DIRECT.
4907          */
4908         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
4909         XLogCtl->pages = allocptr;
4910         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4911
4912         /*
4913          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4914          * in additional info.)
4915          */
4916         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4917         XLogCtl->SharedRecoveryInProgress = true;
4918         XLogCtl->SharedHotStandbyActive = false;
4919         XLogCtl->WalWriterSleeping = false;
4920
4921         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
4922         SpinLockInit(&XLogCtl->info_lck);
4923         SpinLockInit(&XLogCtl->ulsn_lck);
4924         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
4925
4926         /*
4927          * If we are not in bootstrap mode, pg_control should already exist. Read
4928          * and validate it immediately (see comments in ReadControlFile() for the
4929          * reasons why).
4930          */
4931         if (!IsBootstrapProcessingMode())
4932                 ReadControlFile();
4933 }
4934
4935 /*
4936  * This func must be called ONCE on system install.  It creates pg_control
4937  * and the initial XLOG segment.
4938  */
4939 void
4940 BootStrapXLOG(void)
4941 {
4942         CheckPoint      checkPoint;
4943         char       *buffer;
4944         XLogPageHeader page;
4945         XLogLongPageHeader longpage;
4946         XLogRecord *record;
4947         char       *recptr;
4948         bool            use_existent;
4949         uint64          sysidentifier;
4950         char            mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4951         struct timeval tv;
4952         pg_crc32c       crc;
4953
4954         /*
4955          * Select a hopefully-unique system identifier code for this installation.
4956          * We use the result of gettimeofday(), including the fractional seconds
4957          * field, as being about as unique as we can easily get.  (Think not to
4958          * use random(), since it hasn't been seeded and there's no portable way
4959          * to seed it other than the system clock value...)  The upper half of the
4960          * uint64 value is just the tv_sec part, while the lower half contains the
4961          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
4962          * PID for a little extra uniqueness.  A person knowing this encoding can
4963          * determine the initialization time of the installation, which could
4964          * perhaps be useful sometimes.
4965          */
4966         gettimeofday(&tv, NULL);
4967         sysidentifier = ((uint64) tv.tv_sec) << 32;
4968         sysidentifier |= ((uint64) tv.tv_usec) << 12;
4969         sysidentifier |= getpid() & 0xFFF;
4970
4971         /*
4972          * Generate a random nonce. This is used for authentication requests
4973          * that will fail because the user does not exist. The nonce is used to
4974          * create a genuine-looking password challenge for the non-existent user,
4975          * in lieu of an actual stored password.
4976          */
4977         if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4978                 ereport(PANIC,
4979                         (errcode(ERRCODE_INTERNAL_ERROR),
4980                          errmsg("could not generation secret authorization token")));
4981
4982         /* First timeline ID is always 1 */
4983         ThisTimeLineID = 1;
4984
4985         /* page buffer must be aligned suitably for O_DIRECT */
4986         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4987         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
4988         memset(page, 0, XLOG_BLCKSZ);
4989
4990         /*
4991          * Set up information for the initial checkpoint record
4992          *
4993          * The initial checkpoint record is written to the beginning of the WAL
4994          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4995          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4996          */
4997         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4998         checkPoint.ThisTimeLineID = ThisTimeLineID;
4999         checkPoint.PrevTimeLineID = ThisTimeLineID;
5000         checkPoint.fullPageWrites = fullPageWrites;
5001         checkPoint.nextXidEpoch = 0;
5002         checkPoint.nextXid = FirstNormalTransactionId;
5003         checkPoint.nextOid = FirstBootstrapObjectId;
5004         checkPoint.nextMulti = FirstMultiXactId;
5005         checkPoint.nextMultiOffset = 0;
5006         checkPoint.oldestXid = FirstNormalTransactionId;
5007         checkPoint.oldestXidDB = TemplateDbOid;
5008         checkPoint.oldestMulti = FirstMultiXactId;
5009         checkPoint.oldestMultiDB = TemplateDbOid;
5010         checkPoint.oldestCommitTsXid = InvalidTransactionId;
5011         checkPoint.newestCommitTsXid = InvalidTransactionId;
5012         checkPoint.time = (pg_time_t) time(NULL);
5013         checkPoint.oldestActiveXid = InvalidTransactionId;
5014
5015         ShmemVariableCache->nextXid = checkPoint.nextXid;
5016         ShmemVariableCache->nextOid = checkPoint.nextOid;
5017         ShmemVariableCache->oidCount = 0;
5018         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5019         AdvanceOldestClogXid(checkPoint.oldestXid);
5020         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5021         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5022         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5023
5024         /* Set up the XLOG page header */
5025         page->xlp_magic = XLOG_PAGE_MAGIC;
5026         page->xlp_info = XLP_LONG_HEADER;
5027         page->xlp_tli = ThisTimeLineID;
5028         page->xlp_pageaddr = XLogSegSize;
5029         longpage = (XLogLongPageHeader) page;
5030         longpage->xlp_sysid = sysidentifier;
5031         longpage->xlp_seg_size = XLogSegSize;
5032         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5033
5034         /* Insert the initial checkpoint record */
5035         recptr = ((char *) page + SizeOfXLogLongPHD);
5036         record = (XLogRecord *) recptr;
5037         record->xl_prev = 0;
5038         record->xl_xid = InvalidTransactionId;
5039         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5040         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5041         record->xl_rmid = RM_XLOG_ID;
5042         recptr += SizeOfXLogRecord;
5043         /* fill the XLogRecordDataHeaderShort struct */
5044         *(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
5045         *(recptr++) = sizeof(checkPoint);
5046         memcpy(recptr, &checkPoint, sizeof(checkPoint));
5047         recptr += sizeof(checkPoint);
5048         Assert(recptr - (char *) record == record->xl_tot_len);
5049
5050         INIT_CRC32C(crc);
5051         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5052         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5053         FIN_CRC32C(crc);
5054         record->xl_crc = crc;
5055
5056         /* Create first XLOG segment file */
5057         use_existent = false;
5058         openLogFile = XLogFileInit(1, &use_existent, false);
5059
5060         /* Write the first page with the initial record */
5061         errno = 0;
5062         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5063         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5064         {
5065                 /* if write didn't set errno, assume problem is no disk space */
5066                 if (errno == 0)
5067                         errno = ENOSPC;
5068                 ereport(PANIC,
5069                                 (errcode_for_file_access(),
5070                           errmsg("could not write bootstrap transaction log file: %m")));
5071         }
5072         pgstat_report_wait_end();
5073
5074         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5075         if (pg_fsync(openLogFile) != 0)
5076                 ereport(PANIC,
5077                                 (errcode_for_file_access(),
5078                           errmsg("could not fsync bootstrap transaction log file: %m")));
5079         pgstat_report_wait_end();
5080
5081         if (close(openLogFile))
5082                 ereport(PANIC,
5083                                 (errcode_for_file_access(),
5084                           errmsg("could not close bootstrap transaction log file: %m")));
5085
5086         openLogFile = -1;
5087
5088         /* Now create pg_control */
5089
5090         memset(ControlFile, 0, sizeof(ControlFileData));
5091         /* Initialize pg_control status fields */
5092         ControlFile->system_identifier = sysidentifier;
5093         memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5094         ControlFile->state = DB_SHUTDOWNED;
5095         ControlFile->time = checkPoint.time;
5096         ControlFile->checkPoint = checkPoint.redo;
5097         ControlFile->checkPointCopy = checkPoint;
5098         ControlFile->unloggedLSN = 1;
5099
5100         /* Set important parameter values for use when replaying WAL */
5101         ControlFile->MaxConnections = MaxConnections;
5102         ControlFile->max_worker_processes = max_worker_processes;
5103         ControlFile->max_prepared_xacts = max_prepared_xacts;
5104         ControlFile->max_locks_per_xact = max_locks_per_xact;
5105         ControlFile->wal_level = wal_level;
5106         ControlFile->wal_log_hints = wal_log_hints;
5107         ControlFile->track_commit_timestamp = track_commit_timestamp;
5108         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5109
5110         /* some additional ControlFile fields are set in WriteControlFile() */
5111
5112         WriteControlFile();
5113
5114         /* Bootstrap the commit log, too */
5115         BootStrapCLOG();
5116         BootStrapCommitTs();
5117         BootStrapSUBTRANS();
5118         BootStrapMultiXact();
5119
5120         pfree(buffer);
5121 }
5122
5123 static char *
5124 str_time(pg_time_t tnow)
5125 {
5126         static char buf[128];
5127
5128         pg_strftime(buf, sizeof(buf),
5129                                 "%Y-%m-%d %H:%M:%S %Z",
5130                                 pg_localtime(&tnow, log_timezone));
5131
5132         return buf;
5133 }
5134
5135 /*
5136  * See if there is a recovery command file (recovery.conf), and if so
5137  * read in parameters for archive recovery and XLOG streaming.
5138  *
5139  * The file is parsed using the main configuration parser.
5140  */
5141 static void
5142 readRecoveryCommandFile(void)
5143 {
5144         FILE       *fd;
5145         TimeLineID      rtli = 0;
5146         bool            rtliGiven = false;
5147         ConfigVariable *item,
5148                            *head = NULL,
5149                            *tail = NULL;
5150         bool            recoveryTargetActionSet = false;
5151
5152
5153         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5154         if (fd == NULL)
5155         {
5156                 if (errno == ENOENT)
5157                         return;                         /* not there, so no archive recovery */
5158                 ereport(FATAL,
5159                                 (errcode_for_file_access(),
5160                                  errmsg("could not open recovery command file \"%s\": %m",
5161                                                 RECOVERY_COMMAND_FILE)));
5162         }
5163
5164         /*
5165          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5166          * no need to check the return value.
5167          */
5168         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5169
5170         FreeFile(fd);
5171
5172         for (item = head; item; item = item->next)
5173         {
5174                 if (strcmp(item->name, "restore_command") == 0)
5175                 {
5176                         recoveryRestoreCommand = pstrdup(item->value);
5177                         ereport(DEBUG2,
5178                                         (errmsg_internal("restore_command = '%s'",
5179                                                                          recoveryRestoreCommand)));
5180                 }
5181                 else if (strcmp(item->name, "recovery_end_command") == 0)
5182                 {
5183                         recoveryEndCommand = pstrdup(item->value);
5184                         ereport(DEBUG2,
5185                                         (errmsg_internal("recovery_end_command = '%s'",
5186                                                                          recoveryEndCommand)));
5187                 }
5188                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5189                 {
5190                         archiveCleanupCommand = pstrdup(item->value);
5191                         ereport(DEBUG2,
5192                                         (errmsg_internal("archive_cleanup_command = '%s'",
5193                                                                          archiveCleanupCommand)));
5194                 }
5195                 else if (strcmp(item->name, "recovery_target_action") == 0)
5196                 {
5197                         if (strcmp(item->value, "pause") == 0)
5198                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5199                         else if (strcmp(item->value, "promote") == 0)
5200                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5201                         else if (strcmp(item->value, "shutdown") == 0)
5202                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5203                         else
5204                                 ereport(ERROR,
5205                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5206                                 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5207                                            "recovery_target_action",
5208                                            item->value),
5209                                                  errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5210
5211                         ereport(DEBUG2,
5212                                         (errmsg_internal("recovery_target_action = '%s'",
5213                                                                          item->value)));
5214
5215                         recoveryTargetActionSet = true;
5216                 }
5217                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5218                 {
5219                         rtliGiven = true;
5220                         if (strcmp(item->value, "latest") == 0)
5221                                 rtli = 0;
5222                         else
5223                         {
5224                                 errno = 0;
5225                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5226                                 if (errno == EINVAL || errno == ERANGE)
5227                                         ereport(FATAL,
5228                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5229                                                          errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5230                                                                         item->value)));
5231                         }
5232                         if (rtli)
5233                                 ereport(DEBUG2,
5234                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5235                         else
5236                                 ereport(DEBUG2,
5237                                          (errmsg_internal("recovery_target_timeline = latest")));
5238                 }
5239                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5240                 {
5241                         errno = 0;
5242                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5243                         if (errno == EINVAL || errno == ERANGE)
5244                                 ereport(FATAL,
5245                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5246                                   errmsg("recovery_target_xid is not a valid number: \"%s\"",
5247                                                  item->value)));
5248                         ereport(DEBUG2,
5249                                         (errmsg_internal("recovery_target_xid = %u",
5250                                                                          recoveryTargetXid)));
5251                         recoveryTarget = RECOVERY_TARGET_XID;
5252                 }
5253                 else if (strcmp(item->name, "recovery_target_time") == 0)
5254                 {
5255                         recoveryTarget = RECOVERY_TARGET_TIME;
5256
5257                         /*
5258                          * Convert the time string given by the user to TimestampTz form.
5259                          */
5260                         recoveryTargetTime =
5261                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5262                                                                                                 CStringGetDatum(item->value),
5263                                                                                                 ObjectIdGetDatum(InvalidOid),
5264                                                                                                                 Int32GetDatum(-1)));
5265                         ereport(DEBUG2,
5266                                         (errmsg_internal("recovery_target_time = '%s'",
5267                                                                    timestamptz_to_str(recoveryTargetTime))));
5268                 }
5269                 else if (strcmp(item->name, "recovery_target_name") == 0)
5270                 {
5271                         recoveryTarget = RECOVERY_TARGET_NAME;
5272
5273                         recoveryTargetName = pstrdup(item->value);
5274                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5275                                 ereport(FATAL,
5276                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5277                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5278                                                                 MAXFNAMELEN - 1)));
5279
5280                         ereport(DEBUG2,
5281                                         (errmsg_internal("recovery_target_name = '%s'",
5282                                                                          recoveryTargetName)));
5283                 }
5284                 else if (strcmp(item->name, "recovery_target_lsn") == 0)
5285                 {
5286                         recoveryTarget = RECOVERY_TARGET_LSN;
5287
5288                         /*
5289                          * Convert the LSN string given by the user to XLogRecPtr form.
5290                          */
5291                         recoveryTargetLSN =
5292                                 DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
5293                                                                                                 CStringGetDatum(item->value),
5294                                                                                                 ObjectIdGetDatum(InvalidOid),
5295                                                                                                                 Int32GetDatum(-1)));
5296                         ereport(DEBUG2,
5297                                         (errmsg_internal("recovery_target_lsn = '%X/%X'",
5298                                                                          (uint32) (recoveryTargetLSN >> 32),
5299                                                                          (uint32) recoveryTargetLSN)));
5300                 }
5301                 else if (strcmp(item->name, "recovery_target") == 0)
5302                 {
5303                         if (strcmp(item->value, "immediate") == 0)
5304                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5305                         else
5306                                 ereport(ERROR,
5307                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5308                                 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5309                                            "recovery_target",
5310                                            item->value),
5311                                            errhint("The only allowed value is \"immediate\".")));
5312                         ereport(DEBUG2,
5313                                         (errmsg_internal("recovery_target = '%s'",
5314                                                                          item->value)));
5315                 }
5316                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5317                 {
5318                         /*
5319                          * does nothing if a recovery_target is not also set
5320                          */
5321                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5322                                 ereport(ERROR,
5323                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5324                                                  errmsg("parameter \"%s\" requires a Boolean value",
5325                                                                 "recovery_target_inclusive")));
5326                         ereport(DEBUG2,
5327                                         (errmsg_internal("recovery_target_inclusive = %s",
5328                                                                          item->value)));
5329                 }
5330                 else if (strcmp(item->name, "standby_mode") == 0)
5331                 {
5332                         if (!parse_bool(item->value, &StandbyModeRequested))
5333                                 ereport(ERROR,
5334                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5335                                                  errmsg("parameter \"%s\" requires a Boolean value",
5336                                                                 "standby_mode")));
5337                         ereport(DEBUG2,
5338                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5339                 }
5340                 else if (strcmp(item->name, "primary_conninfo") == 0)
5341                 {
5342                         PrimaryConnInfo = pstrdup(item->value);
5343                         ereport(DEBUG2,
5344                                         (errmsg_internal("primary_conninfo = '%s'",
5345                                                                          PrimaryConnInfo)));
5346                 }
5347                 else if (strcmp(item->name, "primary_slot_name") == 0)
5348                 {
5349                         ReplicationSlotValidateName(item->value, ERROR);
5350                         PrimarySlotName = pstrdup(item->value);
5351                         ereport(DEBUG2,
5352                                         (errmsg_internal("primary_slot_name = '%s'",
5353                                                                          PrimarySlotName)));
5354                 }
5355                 else if (strcmp(item->name, "trigger_file") == 0)
5356                 {
5357                         TriggerFile = pstrdup(item->value);
5358                         ereport(DEBUG2,
5359                                         (errmsg_internal("trigger_file = '%s'",
5360                                                                          TriggerFile)));
5361                 }
5362                 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5363                 {
5364                         const char *hintmsg;
5365
5366                         if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5367                                                    &hintmsg))
5368                                 ereport(ERROR,
5369                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5370                                                  errmsg("parameter \"%s\" requires a temporal value",
5371                                                                 "recovery_min_apply_delay"),
5372                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5373                         ereport(DEBUG2,
5374                                         (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5375                 }
5376                 else
5377                         ereport(FATAL,
5378                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5379                                          errmsg("unrecognized recovery parameter \"%s\"",
5380                                                         item->name)));
5381         }
5382
5383         /*
5384          * Check for compulsory parameters
5385          */
5386         if (StandbyModeRequested)
5387         {
5388                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5389                         ereport(WARNING,
5390                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5391                                                         RECOVERY_COMMAND_FILE),
5392                                          errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5393         }
5394         else
5395         {
5396                 if (recoveryRestoreCommand == NULL)
5397                         ereport(FATAL,
5398                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5399                                          errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5400                                                         RECOVERY_COMMAND_FILE)));
5401         }
5402
5403         /*
5404          * Override any inconsistent requests. Not that this is a change of
5405          * behaviour in 9.5; prior to this we simply ignored a request to pause if
5406          * hot_standby = off, which was surprising behaviour.
5407          */
5408         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5409                 recoveryTargetActionSet &&
5410                 !EnableHotStandby)
5411                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5412
5413         /*
5414          * We don't support standby_mode in standalone backends; that requires
5415          * other processes such as the WAL receiver to be alive.
5416          */
5417         if (StandbyModeRequested && !IsUnderPostmaster)
5418                 ereport(FATAL,
5419                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5420                         errmsg("standby mode is not supported by single-user servers")));
5421
5422         /* Enable fetching from archive recovery area */
5423         ArchiveRecoveryRequested = true;
5424
5425         /*
5426          * If user specified recovery_target_timeline, validate it or compute the
5427          * "latest" value.  We can't do this until after we've gotten the restore
5428          * command and set InArchiveRecovery, because we need to fetch timeline
5429          * history files from the archive.
5430          */
5431         if (rtliGiven)
5432         {
5433                 if (rtli)
5434                 {
5435                         /* Timeline 1 does not have a history file, all else should */
5436                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5437                                 ereport(FATAL,
5438                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5439                                                  errmsg("recovery target timeline %u does not exist",
5440                                                                 rtli)));
5441                         recoveryTargetTLI = rtli;
5442                         recoveryTargetIsLatest = false;
5443                 }
5444                 else
5445                 {
5446                         /* We start the "latest" search from pg_control's timeline */
5447                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5448                         recoveryTargetIsLatest = true;
5449                 }
5450         }
5451
5452         FreeConfigVariables(head);
5453 }
5454
5455 /*
5456  * Exit archive-recovery state
5457  */
5458 static void
5459 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5460 {
5461         char            recoveryPath[MAXPGPATH];
5462         char            xlogfname[MAXFNAMELEN];
5463         XLogSegNo       endLogSegNo;
5464         XLogSegNo       startLogSegNo;
5465
5466         /* we always switch to a new timeline after archive recovery */
5467         Assert(endTLI != ThisTimeLineID);
5468
5469         /*
5470          * We are no longer in archive recovery state.
5471          */
5472         InArchiveRecovery = false;
5473
5474         /*
5475          * Update min recovery point one last time.
5476          */
5477         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5478
5479         /*
5480          * If the ending log segment is still open, close it (to avoid problems on
5481          * Windows with trying to rename or delete an open file).
5482          */
5483         if (readFile >= 0)
5484         {
5485                 close(readFile);
5486                 readFile = -1;
5487         }
5488
5489         /*
5490          * Calculate the last segment on the old timeline, and the first segment
5491          * on the new timeline. If the switch happens in the middle of a segment,
5492          * they are the same, but if the switch happens exactly at a segment
5493          * boundary, startLogSegNo will be endLogSegNo + 1.
5494          */
5495         XLByteToPrevSeg(endOfLog, endLogSegNo);
5496         XLByteToSeg(endOfLog, startLogSegNo);
5497
5498         /*
5499          * Initialize the starting WAL segment for the new timeline. If the switch
5500          * happens in the middle of a segment, copy data from the last WAL segment
5501          * of the old timeline up to the switch point, to the starting WAL segment
5502          * on the new timeline.
5503          */
5504         if (endLogSegNo == startLogSegNo)
5505         {
5506                 /*
5507                  * Make a copy of the file on the new timeline.
5508                  *
5509                  * Writing WAL isn't allowed yet, so there are no locking
5510                  * considerations. But we should be just as tense as XLogFileInit to
5511                  * avoid emplacing a bogus file.
5512                  */
5513                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5514                                          endOfLog % XLOG_SEG_SIZE);
5515         }
5516         else
5517         {
5518                 /*
5519                  * The switch happened at a segment boundary, so just create the next
5520                  * segment on the new timeline.
5521                  */
5522                 bool            use_existent = true;
5523                 int                     fd;
5524
5525                 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5526
5527                 if (close(fd))
5528                         ereport(ERROR,
5529                                         (errcode_for_file_access(),
5530                                          errmsg("could not close log file %s: %m",
5531                                                         XLogFileNameP(ThisTimeLineID, startLogSegNo))));
5532         }
5533
5534         /*
5535          * Let's just make real sure there are not .ready or .done flags posted
5536          * for the new segment.
5537          */
5538         XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
5539         XLogArchiveCleanup(xlogfname);
5540
5541         /*
5542          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5543          * of it.
5544          */
5545         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5546         unlink(recoveryPath);           /* ignore any error */
5547
5548         /* Get rid of any remaining recovered timeline-history file, too */
5549         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5550         unlink(recoveryPath);           /* ignore any error */
5551
5552         /*
5553          * Rename the config file out of the way, so that we don't accidentally
5554          * re-enter archive recovery mode in a subsequent crash.
5555          */
5556         unlink(RECOVERY_COMMAND_DONE);
5557         durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
5558
5559         ereport(LOG,
5560                         (errmsg("archive recovery complete")));
5561 }
5562
5563 /*
5564  * Extract timestamp from WAL record.
5565  *
5566  * If the record contains a timestamp, returns true, and saves the timestamp
5567  * in *recordXtime. If the record type has no timestamp, returns false.
5568  * Currently, only transaction commit/abort records and restore points contain
5569  * timestamps.
5570  */
5571 static bool
5572 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5573 {
5574         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5575         uint8           xact_info = info & XLOG_XACT_OPMASK;
5576         uint8           rmid = XLogRecGetRmid(record);
5577
5578         if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5579         {
5580                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5581                 return true;
5582         }
5583         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5584                                                            xact_info == XLOG_XACT_COMMIT_PREPARED))
5585         {
5586                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5587                 return true;
5588         }
5589         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5590                                                            xact_info == XLOG_XACT_ABORT_PREPARED))
5591         {
5592                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5593                 return true;
5594         }
5595         return false;
5596 }
5597
5598 /*
5599  * For point-in-time recovery, this function decides whether we want to
5600  * stop applying the XLOG before the current record.
5601  *
5602  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5603  * information is saved in recoveryStopXid et al for use in annotating the
5604  * new timeline's history file.
5605  */
5606 static bool
5607 recoveryStopsBefore(XLogReaderState *record)
5608 {
5609         bool            stopsHere = false;
5610         uint8           xact_info;
5611         bool            isCommit;
5612         TimestampTz recordXtime = 0;
5613         TransactionId recordXid;
5614
5615         /* Check if we should stop as soon as reaching consistency */
5616         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5617         {
5618                 ereport(LOG,
5619                                 (errmsg("recovery stopping after reaching consistency")));
5620
5621                 recoveryStopAfter = false;
5622                 recoveryStopXid = InvalidTransactionId;
5623                 recoveryStopLSN = InvalidXLogRecPtr;
5624                 recoveryStopTime = 0;
5625                 recoveryStopName[0] = '\0';
5626                 return true;
5627         }
5628
5629         /* Check if target LSN has been reached */
5630         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5631                 !recoveryTargetInclusive &&
5632                 record->ReadRecPtr >= recoveryTargetLSN)
5633         {
5634                 recoveryStopAfter = false;
5635                 recoveryStopXid = InvalidTransactionId;
5636                 recoveryStopLSN = record->ReadRecPtr;
5637                 recoveryStopTime = 0;
5638                 recoveryStopName[0] = '\0';
5639                 ereport(LOG,
5640                                 (errmsg("recovery stopping before WAL position (LSN) \"%X/%X\"",
5641                                                 (uint32) (recoveryStopLSN >> 32),
5642                                                 (uint32) recoveryStopLSN)));
5643                 return true;
5644         }
5645
5646         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5647         if (XLogRecGetRmid(record) != RM_XACT_ID)
5648                 return false;
5649
5650         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5651
5652         if (xact_info == XLOG_XACT_COMMIT)
5653         {
5654                 isCommit = true;
5655                 recordXid = XLogRecGetXid(record);
5656         }
5657         else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5658         {
5659                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5660                 xl_xact_parsed_commit parsed;
5661
5662                 isCommit = true;
5663                 ParseCommitRecord(XLogRecGetInfo(record),
5664                                                   xlrec,
5665                                                   &parsed);
5666                 recordXid = parsed.twophase_xid;
5667         }
5668         else if (xact_info == XLOG_XACT_ABORT)
5669         {
5670                 isCommit = false;
5671                 recordXid = XLogRecGetXid(record);
5672         }
5673         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5674         {
5675                 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5676                 xl_xact_parsed_abort parsed;
5677
5678                 isCommit = true;
5679                 ParseAbortRecord(XLogRecGetInfo(record),
5680                                                  xlrec,
5681                                                  &parsed);
5682                 recordXid = parsed.twophase_xid;
5683         }
5684         else
5685                 return false;
5686
5687         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5688         {
5689                 /*
5690                  * There can be only one transaction end record with this exact
5691                  * transactionid
5692                  *
5693                  * when testing for an xid, we MUST test for equality only, since
5694                  * transactions are numbered in the order they start, not the order
5695                  * they complete. A higher numbered xid will complete before you about
5696                  * 50% of the time...
5697                  */
5698                 stopsHere = (recordXid == recoveryTargetXid);
5699         }
5700
5701         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5702                 getRecordTimestamp(record, &recordXtime))
5703         {
5704                 /*
5705                  * There can be many transactions that share the same commit time, so
5706                  * we stop after the last one, if we are inclusive, or stop at the
5707                  * first one if we are exclusive
5708                  */
5709                 if (recoveryTargetInclusive)
5710                         stopsHere = (recordXtime > recoveryTargetTime);
5711                 else
5712                         stopsHere = (recordXtime >= recoveryTargetTime);
5713         }
5714
5715         if (stopsHere)
5716         {
5717                 recoveryStopAfter = false;
5718                 recoveryStopXid = recordXid;
5719                 recoveryStopTime = recordXtime;
5720                 recoveryStopLSN = InvalidXLogRecPtr;
5721                 recoveryStopName[0] = '\0';
5722
5723                 if (isCommit)
5724                 {
5725                         ereport(LOG,
5726                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5727                                                         recoveryStopXid,
5728                                                         timestamptz_to_str(recoveryStopTime))));
5729                 }
5730                 else
5731                 {
5732                         ereport(LOG,
5733                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5734                                                         recoveryStopXid,
5735                                                         timestamptz_to_str(recoveryStopTime))));
5736                 }
5737         }
5738
5739         return stopsHere;
5740 }
5741
5742 /*
5743  * Same as recoveryStopsBefore, but called after applying the record.
5744  *
5745  * We also track the timestamp of the latest applied COMMIT/ABORT
5746  * record in XLogCtl->recoveryLastXTime.
5747  */
5748 static bool
5749 recoveryStopsAfter(XLogReaderState *record)
5750 {
5751         uint8           info;
5752         uint8           xact_info;
5753         uint8           rmid;
5754         TimestampTz recordXtime;
5755
5756         info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5757         rmid = XLogRecGetRmid(record);
5758
5759         /*
5760          * There can be many restore points that share the same name; we stop at
5761          * the first one.
5762          */
5763         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5764                 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5765         {
5766                 xl_restore_point *recordRestorePointData;
5767
5768                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5769
5770                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5771                 {
5772                         recoveryStopAfter = true;
5773                         recoveryStopXid = InvalidTransactionId;
5774                         recoveryStopLSN = InvalidXLogRecPtr;
5775                         (void) getRecordTimestamp(record, &recoveryStopTime);
5776                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5777
5778                         ereport(LOG,
5779                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5780                                                 recoveryStopName,
5781                                                 timestamptz_to_str(recoveryStopTime))));
5782                         return true;
5783                 }
5784         }
5785
5786         /* Check if the target LSN has been reached */
5787         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5788                 recoveryTargetInclusive &&
5789                 record->ReadRecPtr >= recoveryTargetLSN)
5790         {
5791                 recoveryStopAfter = true;
5792                 recoveryStopXid = InvalidTransactionId;
5793                 recoveryStopLSN = record->ReadRecPtr;
5794                 recoveryStopTime = 0;
5795                 recoveryStopName[0] = '\0';
5796                 ereport(LOG,
5797                                 (errmsg("recovery stopping after WAL position (LSN) \"%X/%X\"",
5798                                                 (uint32) (recoveryStopLSN >> 32),
5799                                                 (uint32) recoveryStopLSN)));
5800                 return true;
5801         }
5802
5803         if (rmid != RM_XACT_ID)
5804                 return false;
5805
5806         xact_info = info & XLOG_XACT_OPMASK;
5807
5808         if (xact_info == XLOG_XACT_COMMIT ||
5809                 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5810                 xact_info == XLOG_XACT_ABORT ||
5811                 xact_info == XLOG_XACT_ABORT_PREPARED)
5812         {
5813                 TransactionId recordXid;
5814
5815                 /* Update the last applied transaction timestamp */
5816                 if (getRecordTimestamp(record, &recordXtime))
5817                         SetLatestXTime(recordXtime);
5818
5819                 /* Extract the XID of the committed/aborted transaction */
5820                 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5821                 {
5822                         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5823                         xl_xact_parsed_commit parsed;
5824
5825                         ParseCommitRecord(XLogRecGetInfo(record),
5826                                                           xlrec,
5827                                                           &parsed);
5828                         recordXid = parsed.twophase_xid;
5829                 }
5830                 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5831                 {
5832                         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5833                         xl_xact_parsed_abort parsed;
5834
5835                         ParseAbortRecord(XLogRecGetInfo(record),
5836                                                          xlrec,
5837                                                          &parsed);
5838                         recordXid = parsed.twophase_xid;
5839                 }
5840                 else
5841                         recordXid = XLogRecGetXid(record);
5842
5843                 /*
5844                  * There can be only one transaction end record with this exact
5845                  * transactionid
5846                  *
5847                  * when testing for an xid, we MUST test for equality only, since
5848                  * transactions are numbered in the order they start, not the order
5849                  * they complete. A higher numbered xid will complete before you about
5850                  * 50% of the time...
5851                  */
5852                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5853                         recordXid == recoveryTargetXid)
5854                 {
5855                         recoveryStopAfter = true;
5856                         recoveryStopXid = recordXid;
5857                         recoveryStopTime = recordXtime;
5858                         recoveryStopLSN = InvalidXLogRecPtr;
5859                         recoveryStopName[0] = '\0';
5860
5861                         if (xact_info == XLOG_XACT_COMMIT ||
5862                                 xact_info == XLOG_XACT_COMMIT_PREPARED)
5863                         {
5864                                 ereport(LOG,
5865                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5866                                                                 recoveryStopXid,
5867                                                                 timestamptz_to_str(recoveryStopTime))));
5868                         }
5869                         else if (xact_info == XLOG_XACT_ABORT ||
5870                                          xact_info == XLOG_XACT_ABORT_PREPARED)
5871                         {
5872                                 ereport(LOG,
5873                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5874                                                                 recoveryStopXid,
5875                                                                 timestamptz_to_str(recoveryStopTime))));
5876                         }
5877                         return true;
5878                 }
5879         }
5880
5881         /* Check if we should stop as soon as reaching consistency */
5882         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5883         {
5884                 ereport(LOG,
5885                                 (errmsg("recovery stopping after reaching consistency")));
5886
5887                 recoveryStopAfter = true;
5888                 recoveryStopXid = InvalidTransactionId;
5889                 recoveryStopTime = 0;
5890                 recoveryStopLSN = InvalidXLogRecPtr;
5891                 recoveryStopName[0] = '\0';
5892                 return true;
5893         }
5894
5895         return false;
5896 }
5897
5898 /*
5899  * Wait until shared recoveryPause flag is cleared.
5900  *
5901  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5902  * Probably not worth the trouble though.  This state shouldn't be one that
5903  * anyone cares about server power consumption in.
5904  */
5905 static void
5906 recoveryPausesHere(void)
5907 {
5908         /* Don't pause unless users can connect! */
5909         if (!LocalHotStandbyActive)
5910                 return;
5911
5912         ereport(LOG,
5913                         (errmsg("recovery has paused"),
5914                          errhint("Execute pg_wal_replay_resume() to continue.")));
5915
5916         while (RecoveryIsPaused())
5917         {
5918                 pg_usleep(1000000L);    /* 1000 ms */
5919                 HandleStartupProcInterrupts();
5920         }
5921 }
5922
5923 bool
5924 RecoveryIsPaused(void)
5925 {
5926         bool            recoveryPause;
5927
5928         SpinLockAcquire(&XLogCtl->info_lck);
5929         recoveryPause = XLogCtl->recoveryPause;
5930         SpinLockRelease(&XLogCtl->info_lck);
5931
5932         return recoveryPause;
5933 }
5934
5935 void
5936 SetRecoveryPause(bool recoveryPause)
5937 {
5938         SpinLockAcquire(&XLogCtl->info_lck);
5939         XLogCtl->recoveryPause = recoveryPause;
5940         SpinLockRelease(&XLogCtl->info_lck);
5941 }
5942
5943 /*
5944  * When recovery_min_apply_delay is set, we wait long enough to make sure
5945  * certain record types are applied at least that interval behind the master.
5946  *
5947  * Returns true if we waited.
5948  *
5949  * Note that the delay is calculated between the WAL record log time and
5950  * the current time on standby. We would prefer to keep track of when this
5951  * standby received each WAL record, which would allow a more consistent
5952  * approach and one not affected by time synchronisation issues, but that
5953  * is significantly more effort and complexity for little actual gain in
5954  * usability.
5955  */
5956 static bool
5957 recoveryApplyDelay(XLogReaderState *record)
5958 {
5959         uint8           xact_info;
5960         TimestampTz xtime;
5961         long            secs;
5962         int                     microsecs;
5963
5964         /* nothing to do if no delay configured */
5965         if (recovery_min_apply_delay <= 0)
5966                 return false;
5967
5968         /* no delay is applied on a database not yet consistent */
5969         if (!reachedConsistency)
5970                 return false;
5971
5972         /*
5973          * Is it a COMMIT record?
5974          *
5975          * We deliberately choose not to delay aborts since they have no effect on
5976          * MVCC. We already allow replay of records that don't have a timestamp,
5977          * so there is already opportunity for issues caused by early conflicts on
5978          * standbys.
5979          */
5980         if (XLogRecGetRmid(record) != RM_XACT_ID)
5981                 return false;
5982
5983         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5984
5985         if (xact_info != XLOG_XACT_COMMIT &&
5986                 xact_info != XLOG_XACT_COMMIT_PREPARED)
5987                 return false;
5988
5989         if (!getRecordTimestamp(record, &xtime))
5990                 return false;
5991
5992         recoveryDelayUntilTime =
5993                 TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
5994
5995         /*
5996          * Exit without arming the latch if it's already past time to apply this
5997          * record
5998          */
5999         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6000                                                 &secs, &microsecs);
6001         if (secs <= 0 && microsecs <= 0)
6002                 return false;
6003
6004         while (true)
6005         {
6006                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6007
6008                 /* might change the trigger file's location */
6009                 HandleStartupProcInterrupts();
6010
6011                 if (CheckForStandbyTrigger())
6012                         break;
6013
6014                 /*
6015                  * Wait for difference between GetCurrentTimestamp() and
6016                  * recoveryDelayUntilTime
6017                  */
6018                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6019                                                         &secs, &microsecs);
6020
6021                 /* NB: We're ignoring waits below min_apply_delay's resolution. */
6022                 if (secs <= 0 && microsecs / 1000 <= 0)
6023                         break;
6024
6025                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6026                          secs, microsecs / 1000);
6027
6028                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6029                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6030                                   secs * 1000L + microsecs / 1000,
6031                                   WAIT_EVENT_RECOVERY_APPLY_DELAY);
6032         }
6033         return true;
6034 }
6035
6036 /*
6037  * Save timestamp of latest processed commit/abort record.
6038  *
6039  * We keep this in XLogCtl, not a simple static variable, so that it can be
6040  * seen by processes other than the startup process.  Note in particular
6041  * that CreateRestartPoint is executed in the checkpointer.
6042  */
6043 static void
6044 SetLatestXTime(TimestampTz xtime)
6045 {
6046         SpinLockAcquire(&XLogCtl->info_lck);
6047         XLogCtl->recoveryLastXTime = xtime;
6048         SpinLockRelease(&XLogCtl->info_lck);
6049 }
6050
6051 /*
6052  * Fetch timestamp of latest processed commit/abort record.
6053  */
6054 TimestampTz
6055 GetLatestXTime(void)
6056 {
6057         TimestampTz xtime;
6058
6059         SpinLockAcquire(&XLogCtl->info_lck);
6060         xtime = XLogCtl->recoveryLastXTime;
6061         SpinLockRelease(&XLogCtl->info_lck);
6062
6063         return xtime;
6064 }
6065
6066 /*
6067  * Save timestamp of the next chunk of WAL records to apply.
6068  *
6069  * We keep this in XLogCtl, not a simple static variable, so that it can be
6070  * seen by all backends.
6071  */
6072 static void
6073 SetCurrentChunkStartTime(TimestampTz xtime)
6074 {
6075         SpinLockAcquire(&XLogCtl->info_lck);
6076         XLogCtl->currentChunkStartTime = xtime;
6077         SpinLockRelease(&XLogCtl->info_lck);
6078 }
6079
6080 /*
6081  * Fetch timestamp of latest processed commit/abort record.
6082  * Startup process maintains an accurate local copy in XLogReceiptTime
6083  */
6084 TimestampTz
6085 GetCurrentChunkReplayStartTime(void)
6086 {
6087         TimestampTz xtime;
6088
6089         SpinLockAcquire(&XLogCtl->info_lck);
6090         xtime = XLogCtl->currentChunkStartTime;
6091         SpinLockRelease(&XLogCtl->info_lck);
6092
6093         return xtime;
6094 }
6095
6096 /*
6097  * Returns time of receipt of current chunk of XLOG data, as well as
6098  * whether it was received from streaming replication or from archives.
6099  */
6100 void
6101 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6102 {
6103         /*
6104          * This must be executed in the startup process, since we don't export the
6105          * relevant state to shared memory.
6106          */
6107         Assert(InRecovery);
6108
6109         *rtime = XLogReceiptTime;
6110         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6111 }
6112
6113 /*
6114  * Note that text field supplied is a parameter name and does not require
6115  * translation
6116  */
6117 #define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
6118 do { \
6119         if ((currValue) < (minValue)) \
6120                 ereport(ERROR, \
6121                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
6122                                  errmsg("hot standby is not possible because " \
6123                                                 "%s = %d is a lower setting than on the master server " \
6124                                                 "(its value was %d)", \
6125                                                 param_name, \
6126                                                 currValue, \
6127                                                 minValue))); \
6128 } while(0)
6129
6130 /*
6131  * Check to see if required parameters are set high enough on this server
6132  * for various aspects of recovery operation.
6133  *
6134  * Note that all the parameters which this function tests need to be
6135  * listed in Administrator's Overview section in high-availability.sgml.
6136  * If you change them, don't forget to update the list.
6137  */
6138 static void
6139 CheckRequiredParameterValues(void)
6140 {
6141         /*
6142          * For archive recovery, the WAL must be generated with at least 'replica'
6143          * wal_level.
6144          */
6145         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6146         {
6147                 ereport(WARNING,
6148                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6149                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6150         }
6151
6152         /*
6153          * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6154          * must have at least as many backend slots as the primary.
6155          */
6156         if (ArchiveRecoveryRequested && EnableHotStandby)
6157         {
6158                 if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6159                         ereport(ERROR,
6160                                         (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6161                                          errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6162
6163                 /* We ignore autovacuum_max_workers when we make this test. */
6164                 RecoveryRequiresIntParameter("max_connections",
6165                                                                          MaxConnections,
6166                                                                          ControlFile->MaxConnections);
6167                 RecoveryRequiresIntParameter("max_worker_processes",
6168                                                                          max_worker_processes,
6169                                                                          ControlFile->max_worker_processes);
6170                 RecoveryRequiresIntParameter("max_prepared_transactions",
6171                                                                          max_prepared_xacts,
6172                                                                          ControlFile->max_prepared_xacts);
6173                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6174                                                                          max_locks_per_xact,
6175                                                                          ControlFile->max_locks_per_xact);
6176         }
6177 }
6178
6179 /*
6180  * This must be called ONCE during postmaster or standalone-backend startup
6181  */
6182 void
6183 StartupXLOG(void)
6184 {
6185         XLogCtlInsert *Insert;
6186         CheckPoint      checkPoint;
6187         bool            wasShutdown;
6188         bool            reachedStopPoint = false;
6189         bool            haveBackupLabel = false;
6190         bool            haveTblspcMap = false;
6191         XLogRecPtr      RecPtr,
6192                                 checkPointLoc,
6193                                 EndOfLog;
6194         TimeLineID      EndOfLogTLI;
6195         TimeLineID      PrevTimeLineID;
6196         XLogRecord *record;
6197         TransactionId oldestActiveXID;
6198         bool            backupEndRequired = false;
6199         bool            backupFromStandby = false;
6200         DBState         dbstate_at_startup;
6201         XLogReaderState *xlogreader;
6202         XLogPageReadPrivate private;
6203         bool            fast_promoted = false;
6204         struct stat st;
6205
6206         /*
6207          * Read control file and check XLOG status looks valid.
6208          *
6209          * Note: in most control paths, *ControlFile is already valid and we need
6210          * not do ReadControlFile() here, but might as well do it to be sure.
6211          */
6212         ReadControlFile();
6213
6214         if (ControlFile->state < DB_SHUTDOWNED ||
6215                 ControlFile->state > DB_IN_PRODUCTION ||
6216                 !XRecOffIsValid(ControlFile->checkPoint))
6217                 ereport(FATAL,
6218                                 (errmsg("control file contains invalid data")));
6219
6220         if (ControlFile->state == DB_SHUTDOWNED)
6221         {
6222                 /* This is the expected case, so don't be chatty in standalone mode */
6223                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6224                                 (errmsg("database system was shut down at %s",
6225                                                 str_time(ControlFile->time))));
6226         }
6227         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6228                 ereport(LOG,
6229                                 (errmsg("database system was shut down in recovery at %s",
6230                                                 str_time(ControlFile->time))));
6231         else if (ControlFile->state == DB_SHUTDOWNING)
6232                 ereport(LOG,
6233                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6234                                                 str_time(ControlFile->time))));
6235         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6236                 ereport(LOG,
6237                    (errmsg("database system was interrupted while in recovery at %s",
6238                                    str_time(ControlFile->time)),
6239                         errhint("This probably means that some data is corrupted and"
6240                                         " you will have to use the last backup for recovery.")));
6241         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6242                 ereport(LOG,
6243                                 (errmsg("database system was interrupted while in recovery at log time %s",
6244                                                 str_time(ControlFile->checkPointCopy.time)),
6245                                  errhint("If this has occurred more than once some data might be corrupted"
6246                           " and you might need to choose an earlier recovery target.")));
6247         else if (ControlFile->state == DB_IN_PRODUCTION)
6248                 ereport(LOG,
6249                           (errmsg("database system was interrupted; last known up at %s",
6250                                           str_time(ControlFile->time))));
6251
6252         /* This is just to allow attaching to startup process with a debugger */
6253 #ifdef XLOG_REPLAY_DELAY
6254         if (ControlFile->state != DB_SHUTDOWNED)
6255                 pg_usleep(60000000L);
6256 #endif
6257
6258         /*
6259          * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6260          * someone has performed a copy for PITR, these directories may have been
6261          * excluded and need to be re-created.
6262          */
6263         ValidateXLOGDirectoryStructure();
6264
6265         /*
6266          * If we previously crashed, there might be data which we had written,
6267          * intending to fsync it, but which we had not actually fsync'd yet.
6268          * Therefore, a power failure in the near future might cause earlier
6269          * unflushed writes to be lost, even though more recent data written to
6270          * disk from here on would be persisted.  To avoid that, fsync the entire
6271          * data directory.
6272          */
6273         if (ControlFile->state != DB_SHUTDOWNED &&
6274                 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6275                 SyncDataDirectory();
6276
6277         /*
6278          * Initialize on the assumption we want to recover to the latest timeline
6279          * that's active according to pg_control.
6280          */
6281         if (ControlFile->minRecoveryPointTLI >
6282                 ControlFile->checkPointCopy.ThisTimeLineID)
6283                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6284         else
6285                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6286
6287         /*
6288          * Check for recovery control file, and if so set up state for offline
6289          * recovery
6290          */
6291         readRecoveryCommandFile();
6292
6293         /*
6294          * Save archive_cleanup_command in shared memory so that other processes
6295          * can see it.
6296          */
6297         strlcpy(XLogCtl->archiveCleanupCommand,
6298                         archiveCleanupCommand ? archiveCleanupCommand : "",
6299                         sizeof(XLogCtl->archiveCleanupCommand));
6300
6301         if (ArchiveRecoveryRequested)
6302         {
6303                 if (StandbyModeRequested)
6304                         ereport(LOG,
6305                                         (errmsg("entering standby mode")));
6306                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6307                         ereport(LOG,
6308                                         (errmsg("starting point-in-time recovery to XID %u",
6309                                                         recoveryTargetXid)));
6310                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6311                         ereport(LOG,
6312                                         (errmsg("starting point-in-time recovery to %s",
6313                                                         timestamptz_to_str(recoveryTargetTime))));
6314                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6315                         ereport(LOG,
6316                                         (errmsg("starting point-in-time recovery to \"%s\"",
6317                                                         recoveryTargetName)));
6318                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6319                         ereport(LOG,
6320                                         (errmsg("starting point-in-time recovery to WAL position (LSN) \"%X/%X\"",
6321                                                         (uint32) (recoveryTargetLSN >> 32),
6322                                                         (uint32) recoveryTargetLSN)));
6323                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6324                         ereport(LOG,
6325                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6326                 else
6327                         ereport(LOG,
6328                                         (errmsg("starting archive recovery")));
6329         }
6330
6331         /*
6332          * Take ownership of the wakeup latch if we're going to sleep during
6333          * recovery.
6334          */
6335         if (StandbyModeRequested)
6336                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6337
6338         /* Set up XLOG reader facility */
6339         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6340         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6341         if (!xlogreader)
6342                 ereport(ERROR,
6343                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6344                                  errmsg("out of memory"),
6345                    errdetail("Failed while allocating a WAL reading processor.")));
6346         xlogreader->system_identifier = ControlFile->system_identifier;
6347
6348         /*
6349          * Allocate pages dedicated to WAL consistency checks, those had better
6350          * be aligned.
6351          */
6352         replay_image_masked = (char *) palloc(BLCKSZ);
6353         master_image_masked = (char *) palloc(BLCKSZ);
6354
6355         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6356                                                   &backupFromStandby))
6357         {
6358                 List       *tablespaces = NIL;
6359
6360                 /*
6361                  * Archive recovery was requested, and thanks to the backup label
6362                  * file, we know how far we need to replay to reach consistency. Enter
6363                  * archive recovery directly.
6364                  */
6365                 InArchiveRecovery = true;
6366                 if (StandbyModeRequested)
6367                         StandbyMode = true;
6368
6369                 /*
6370                  * When a backup_label file is present, we want to roll forward from
6371                  * the checkpoint it identifies, rather than using pg_control.
6372                  */
6373                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6374                 if (record != NULL)
6375                 {
6376                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6377                         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6378                         ereport(DEBUG1,
6379                                         (errmsg("checkpoint record is at %X/%X",
6380                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6381                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6382
6383                         /*
6384                          * Make sure that REDO location exists. This may not be the case
6385                          * if there was a crash during an online backup, which left a
6386                          * backup_label around that references a WAL segment that's
6387                          * already been archived.
6388                          */
6389                         if (checkPoint.redo < checkPointLoc)
6390                         {
6391                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6392                                         ereport(FATAL,
6393                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6394                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6395                         }
6396                 }
6397                 else
6398                 {
6399                         ereport(FATAL,
6400                                         (errmsg("could not locate required checkpoint record"),
6401                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6402                         wasShutdown = false;    /* keep compiler quiet */
6403                 }
6404
6405                 /* read the tablespace_map file if present and create symlinks. */
6406                 if (read_tablespace_map(&tablespaces))
6407                 {
6408                         ListCell   *lc;
6409
6410                         foreach(lc, tablespaces)
6411                         {
6412                                 tablespaceinfo *ti = lfirst(lc);
6413                                 char       *linkloc;
6414
6415                                 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6416
6417                                 /*
6418                                  * Remove the existing symlink if any and Create the symlink
6419                                  * under PGDATA.
6420                                  */
6421                                 remove_tablespace_symlink(linkloc);
6422
6423                                 if (symlink(ti->path, linkloc) < 0)
6424                                         ereport(ERROR,
6425                                                         (errcode_for_file_access(),
6426                                                   errmsg("could not create symbolic link \"%s\": %m",
6427                                                                  linkloc)));
6428
6429                                 pfree(ti->oid);
6430                                 pfree(ti->path);
6431                                 pfree(ti);
6432                         }
6433
6434                         /* set flag to delete it later */
6435                         haveTblspcMap = true;
6436                 }
6437
6438                 /* set flag to delete it later */
6439                 haveBackupLabel = true;
6440         }
6441         else
6442         {
6443                 /*
6444                  * If tablespace_map file is present without backup_label file, there
6445                  * is no use of such file.  There is no harm in retaining it, but it
6446                  * is better to get rid of the map file so that we don't have any
6447                  * redundant file in data directory and it will avoid any sort of
6448                  * confusion.  It seems prudent though to just rename the file out of
6449                  * the way rather than delete it completely, also we ignore any error
6450                  * that occurs in rename operation as even if map file is present
6451                  * without backup_label file, it is harmless.
6452                  */
6453                 if (stat(TABLESPACE_MAP, &st) == 0)
6454                 {
6455                         unlink(TABLESPACE_MAP_OLD);
6456                         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6457                                 ereport(LOG,
6458                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6459                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6460                                  errdetail("File \"%s\" was renamed to \"%s\".",
6461                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6462                         else
6463                                 ereport(LOG,
6464                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6465                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6466                                  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6467                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6468                 }
6469
6470                 /*
6471                  * It's possible that archive recovery was requested, but we don't
6472                  * know how far we need to replay the WAL before we reach consistency.
6473                  * This can happen for example if a base backup is taken from a
6474                  * running server using an atomic filesystem snapshot, without calling
6475                  * pg_start/stop_backup. Or if you just kill a running master server
6476                  * and put it into archive recovery by creating a recovery.conf file.
6477                  *
6478                  * Our strategy in that case is to perform crash recovery first,
6479                  * replaying all the WAL present in pg_wal, and only enter archive
6480                  * recovery after that.
6481                  *
6482                  * But usually we already know how far we need to replay the WAL (up
6483                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6484                  * end-of-backup record), and we can enter archive recovery directly.
6485                  */
6486                 if (ArchiveRecoveryRequested &&
6487                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6488                          ControlFile->backupEndRequired ||
6489                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6490                          ControlFile->state == DB_SHUTDOWNED))
6491                 {
6492                         InArchiveRecovery = true;
6493                         if (StandbyModeRequested)
6494                                 StandbyMode = true;
6495                 }
6496
6497                 /*
6498                  * Get the last valid checkpoint record.  If the latest one according
6499                  * to pg_control is broken, try the next-to-last one.
6500                  */
6501                 checkPointLoc = ControlFile->checkPoint;
6502                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6503                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6504                 if (record != NULL)
6505                 {
6506                         ereport(DEBUG1,
6507                                         (errmsg("checkpoint record is at %X/%X",
6508                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6509                 }
6510                 else if (StandbyMode)
6511                 {
6512                         /*
6513                          * The last valid checkpoint record required for a streaming
6514                          * recovery exists in neither standby nor the primary.
6515                          */
6516                         ereport(PANIC,
6517                                         (errmsg("could not locate a valid checkpoint record")));
6518                 }
6519                 else
6520                 {
6521                         checkPointLoc = ControlFile->prevCheckPoint;
6522                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6523                         if (record != NULL)
6524                         {
6525                                 ereport(LOG,
6526                                                 (errmsg("using previous checkpoint record at %X/%X",
6527                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6528                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6529                         }
6530                         else
6531                                 ereport(PANIC,
6532                                          (errmsg("could not locate a valid checkpoint record")));
6533                 }
6534                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6535                 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6536         }
6537
6538         /*
6539          * Clear out any old relcache cache files.  This is *necessary* if we do
6540          * any WAL replay, since that would probably result in the cache files
6541          * being out of sync with database reality.  In theory we could leave them
6542          * in place if the database had been cleanly shut down, but it seems
6543          * safest to just remove them always and let them be rebuilt during the
6544          * first backend startup.  These files needs to be removed from all
6545          * directories including pg_tblspc, however the symlinks are created only
6546          * after reading tablespace_map file in case of archive recovery from
6547          * backup, so needs to clear old relcache files here after creating
6548          * symlinks.
6549          */
6550         RelationCacheInitFileRemove();
6551
6552         /*
6553          * If the location of the checkpoint record is not on the expected
6554          * timeline in the history of the requested timeline, we cannot proceed:
6555          * the backup is not part of the history of the requested timeline.
6556          */
6557         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6558                                                                  * record */
6559         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6560                 checkPoint.ThisTimeLineID)
6561         {
6562                 XLogRecPtr      switchpoint;
6563
6564                 /*
6565                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6566                  * not in expectedTLEs at all.
6567                  */
6568                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6569                 ereport(FATAL,
6570                                 (errmsg("requested timeline %u is not a child of this server's history",
6571                                                 recoveryTargetTLI),
6572                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6573                                                    (uint32) (ControlFile->checkPoint >> 32),
6574                                                    (uint32) ControlFile->checkPoint,
6575                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6576                                                    (uint32) (switchpoint >> 32),
6577                                                    (uint32) switchpoint)));
6578         }
6579
6580         /*
6581          * The min recovery point should be part of the requested timeline's
6582          * history, too.
6583          */
6584         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6585           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6586                 ControlFile->minRecoveryPointTLI)
6587                 ereport(FATAL,
6588                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6589                                                 recoveryTargetTLI,
6590                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6591                                                 (uint32) ControlFile->minRecoveryPoint,
6592                                                 ControlFile->minRecoveryPointTLI)));
6593
6594         LastRec = RecPtr = checkPointLoc;
6595
6596         ereport(DEBUG1,
6597                         (errmsg_internal("redo record is at %X/%X; shutdown %s",
6598                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6599                                                          wasShutdown ? "TRUE" : "FALSE")));
6600         ereport(DEBUG1,
6601                         (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6602                                                          checkPoint.nextXidEpoch, checkPoint.nextXid,
6603                                                          checkPoint.nextOid)));
6604         ereport(DEBUG1,
6605                         (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6606                                                  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6607         ereport(DEBUG1,
6608            (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6609                                                 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6610         ereport(DEBUG1,
6611                         (errmsg_internal("oldest MultiXactId: %u, in database %u",
6612                                                  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6613         ereport(DEBUG1,
6614                         (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6615                                                          checkPoint.oldestCommitTsXid,
6616                                                          checkPoint.newestCommitTsXid)));
6617         if (!TransactionIdIsNormal(checkPoint.nextXid))
6618                 ereport(PANIC,
6619                                 (errmsg("invalid next transaction ID")));
6620
6621         /* initialize shared memory variables from the checkpoint record */
6622         ShmemVariableCache->nextXid = checkPoint.nextXid;
6623         ShmemVariableCache->nextOid = checkPoint.nextOid;
6624         ShmemVariableCache->oidCount = 0;
6625         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6626         AdvanceOldestClogXid(checkPoint.oldestXid);
6627         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6628         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6629         SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6630                                          checkPoint.newestCommitTsXid);
6631         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6632         XLogCtl->ckptXid = checkPoint.nextXid;
6633
6634         /*
6635          * Initialize replication slots, before there's a chance to remove
6636          * required resources.
6637          */
6638         StartupReplicationSlots();
6639
6640         /*
6641          * Startup logical state, needs to be setup now so we have proper data
6642          * during crash recovery.
6643          */
6644         StartupReorderBuffer();
6645
6646         /*
6647          * Startup MultiXact. We need to do this early to be able to replay
6648          * truncations.
6649          */
6650         StartupMultiXact();
6651
6652         /*
6653          * Ditto commit timestamps.  In a standby, we do it if setting is enabled
6654          * in ControlFile; in a master we base the decision on the GUC itself.
6655          */
6656         if (ArchiveRecoveryRequested ?
6657                 ControlFile->track_commit_timestamp : track_commit_timestamp)
6658                 StartupCommitTs();
6659
6660         /*
6661          * Recover knowledge about replay progress of known replication partners.
6662          */
6663         StartupReplicationOrigin();
6664
6665         /*
6666          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6667          * control file. On recovery, all unlogged relations are blown away, so
6668          * the unlogged LSN counter can be reset too.
6669          */
6670         if (ControlFile->state == DB_SHUTDOWNED)
6671                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6672         else
6673                 XLogCtl->unloggedLSN = 1;
6674
6675         /*
6676          * We must replay WAL entries using the same TimeLineID they were created
6677          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6678          * also xlog_redo()).
6679          */
6680         ThisTimeLineID = checkPoint.ThisTimeLineID;
6681
6682         /*
6683          * Copy any missing timeline history files between 'now' and the recovery
6684          * target timeline from archive to pg_wal. While we don't need those
6685          * files ourselves - the history file of the recovery target timeline
6686          * covers all the previous timelines in the history too - a cascading
6687          * standby server might be interested in them. Or, if you archive the WAL
6688          * from this server to a different archive than the master, it'd be good
6689          * for all the history files to get archived there after failover, so that
6690          * you can use one of the old timelines as a PITR target. Timeline history
6691          * files are small, so it's better to copy them unnecessarily than not
6692          * copy them and regret later.
6693          */
6694         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6695
6696         lastFullPageWrites = checkPoint.fullPageWrites;
6697
6698         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6699         doPageWrites = lastFullPageWrites;
6700
6701         if (RecPtr < checkPoint.redo)
6702                 ereport(PANIC,
6703                                 (errmsg("invalid redo in checkpoint record")));
6704
6705         /*
6706          * Check whether we need to force recovery from WAL.  If it appears to
6707          * have been a clean shutdown and we did not have a recovery.conf file,
6708          * then assume no recovery needed.
6709          */
6710         if (checkPoint.redo < RecPtr)
6711         {
6712                 if (wasShutdown)
6713                         ereport(PANIC,
6714                                         (errmsg("invalid redo record in shutdown checkpoint")));
6715                 InRecovery = true;
6716         }
6717         else if (ControlFile->state != DB_SHUTDOWNED)
6718                 InRecovery = true;
6719         else if (ArchiveRecoveryRequested)
6720         {
6721                 /* force recovery due to presence of recovery.conf */
6722                 InRecovery = true;
6723         }
6724
6725         /* REDO */
6726         if (InRecovery)
6727         {
6728                 int                     rmid;
6729
6730                 /*
6731                  * Update pg_control to show that we are recovering and to show the
6732                  * selected checkpoint as the place we are starting from. We also mark
6733                  * pg_control with any minimum recovery stop point obtained from a
6734                  * backup history file.
6735                  */
6736                 dbstate_at_startup = ControlFile->state;
6737                 if (InArchiveRecovery)
6738                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6739                 else
6740                 {
6741                         ereport(LOG,
6742                                         (errmsg("database system was not properly shut down; "
6743                                                         "automatic recovery in progress")));
6744                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6745                                 ereport(LOG,
6746                                                 (errmsg("crash recovery starts in timeline %u "
6747                                                                 "and has target timeline %u",
6748                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6749                                                                 recoveryTargetTLI)));
6750                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6751                 }
6752                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6753                 ControlFile->checkPoint = checkPointLoc;
6754                 ControlFile->checkPointCopy = checkPoint;
6755                 if (InArchiveRecovery)
6756                 {
6757                         /* initialize minRecoveryPoint if not set yet */
6758                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6759                         {
6760                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6761                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6762                         }
6763                 }
6764
6765                 /*
6766                  * Set backupStartPoint if we're starting recovery from a base backup.
6767                  *
6768                  * Also set backupEndPoint and use minRecoveryPoint as the backup end
6769                  * location if we're starting recovery from a base backup which was
6770                  * taken from a standby. In this case, the database system status in
6771                  * pg_control must indicate that the database was already in recovery.
6772                  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6773                  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6774                  * before reaching this point; e.g. because restore_command or
6775                  * primary_conninfo were faulty.
6776                  *
6777                  * Any other state indicates that the backup somehow became corrupted
6778                  * and we can't sensibly continue with recovery.
6779                  */
6780                 if (haveBackupLabel)
6781                 {
6782                         ControlFile->backupStartPoint = checkPoint.redo;
6783                         ControlFile->backupEndRequired = backupEndRequired;
6784
6785                         if (backupFromStandby)
6786                         {
6787                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6788                                         dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6789                                         ereport(FATAL,
6790                                                         (errmsg("backup_label contains data inconsistent with control file"),
6791                                                          errhint("This means that the backup is corrupted and you will "
6792                                                            "have to use another backup for recovery.")));
6793                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6794                         }
6795                 }
6796                 ControlFile->time = (pg_time_t) time(NULL);
6797                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6798                 UpdateControlFile();
6799
6800                 /* initialize our local copy of minRecoveryPoint */
6801                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6802                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6803
6804                 /*
6805                  * Reset pgstat data, because it may be invalid after recovery.
6806                  */
6807                 pgstat_reset_all();
6808
6809                 /*
6810                  * If there was a backup label file, it's done its job and the info
6811                  * has now been propagated into pg_control.  We must get rid of the
6812                  * label file so that if we crash during recovery, we'll pick up at
6813                  * the latest recovery restartpoint instead of going all the way back
6814                  * to the backup start point.  It seems prudent though to just rename
6815                  * the file out of the way rather than delete it completely.
6816                  */
6817                 if (haveBackupLabel)
6818                 {
6819                         unlink(BACKUP_LABEL_OLD);
6820                         durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
6821                 }
6822
6823                 /*
6824                  * If there was a tablespace_map file, it's done its job and the
6825                  * symlinks have been created.  We must get rid of the map file so
6826                  * that if we crash during recovery, we don't create symlinks again.
6827                  * It seems prudent though to just rename the file out of the way
6828                  * rather than delete it completely.
6829                  */
6830                 if (haveTblspcMap)
6831                 {
6832                         unlink(TABLESPACE_MAP_OLD);
6833                         durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
6834                 }
6835
6836                 /* Check that the GUCs used to generate the WAL allow recovery */
6837                 CheckRequiredParameterValues();
6838
6839                 /*
6840                  * We're in recovery, so unlogged relations may be trashed and must be
6841                  * reset.  This should be done BEFORE allowing Hot Standby
6842                  * connections, so that read-only backends don't try to read whatever
6843                  * garbage is left over from before.
6844                  */
6845                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6846
6847                 /*
6848                  * Likewise, delete any saved transaction snapshot files that got left
6849                  * behind by crashed backends.
6850                  */
6851                 DeleteAllExportedSnapshotFiles();
6852
6853                 /*
6854                  * Initialize for Hot Standby, if enabled. We won't let backends in
6855                  * yet, not until we've reached the min recovery point specified in
6856                  * control file and we've established a recovery snapshot from a
6857                  * running-xacts WAL record.
6858                  */
6859                 if (ArchiveRecoveryRequested && EnableHotStandby)
6860                 {
6861                         TransactionId *xids;
6862                         int                     nxids;
6863
6864                         ereport(DEBUG1,
6865                                         (errmsg("initializing for hot standby")));
6866
6867                         InitRecoveryTransactionEnvironment();
6868
6869                         if (wasShutdown)
6870                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6871                         else
6872                                 oldestActiveXID = checkPoint.oldestActiveXid;
6873                         Assert(TransactionIdIsValid(oldestActiveXID));
6874
6875                         /* Tell procarray about the range of xids it has to deal with */
6876                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6877
6878                         /*
6879                          * Startup commit log and subtrans only.  MultiXact and commit
6880                          * timestamp have already been started up and other SLRUs are not
6881                          * maintained during recovery and need not be started yet.
6882                          */
6883                         StartupCLOG();
6884                         StartupSUBTRANS(oldestActiveXID);
6885
6886                         /*
6887                          * If we're beginning at a shutdown checkpoint, we know that
6888                          * nothing was running on the master at this point. So fake-up an
6889                          * empty running-xacts record and use that here and now. Recover
6890                          * additional standby state for prepared transactions.
6891                          */
6892                         if (wasShutdown)
6893                         {
6894                                 RunningTransactionsData running;
6895                                 TransactionId latestCompletedXid;
6896
6897                                 /*
6898                                  * Construct a RunningTransactions snapshot representing a
6899                                  * shut down server, with only prepared transactions still
6900                                  * alive. We're never overflowed at this point because all
6901                                  * subxids are listed with their parent prepared transactions.
6902                                  */
6903                                 running.xcnt = nxids;
6904                                 running.subxcnt = 0;
6905                                 running.subxid_overflow = false;
6906                                 running.nextXid = checkPoint.nextXid;
6907                                 running.oldestRunningXid = oldestActiveXID;
6908                                 latestCompletedXid = checkPoint.nextXid;
6909                                 TransactionIdRetreat(latestCompletedXid);
6910                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6911                                 running.latestCompletedXid = latestCompletedXid;
6912                                 running.xids = xids;
6913
6914                                 ProcArrayApplyRecoveryInfo(&running);
6915
6916                                 StandbyRecoverPreparedTransactions(false);
6917                         }
6918                 }
6919
6920                 /* Initialize resource managers */
6921                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6922                 {
6923                         if (RmgrTable[rmid].rm_startup != NULL)
6924                                 RmgrTable[rmid].rm_startup();
6925                 }
6926
6927                 /*
6928                  * Initialize shared variables for tracking progress of WAL replay, as
6929                  * if we had just replayed the record before the REDO location (or the
6930                  * checkpoint record itself, if it's a shutdown checkpoint).
6931                  */
6932                 SpinLockAcquire(&XLogCtl->info_lck);
6933                 if (checkPoint.redo < RecPtr)
6934                         XLogCtl->replayEndRecPtr = checkPoint.redo;
6935                 else
6936                         XLogCtl->replayEndRecPtr = EndRecPtr;
6937                 XLogCtl->replayEndTLI = ThisTimeLineID;
6938                 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
6939                 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
6940                 XLogCtl->recoveryLastXTime = 0;
6941                 XLogCtl->currentChunkStartTime = 0;
6942                 XLogCtl->recoveryPause = false;
6943                 SpinLockRelease(&XLogCtl->info_lck);
6944
6945                 /* Also ensure XLogReceiptTime has a sane value */
6946                 XLogReceiptTime = GetCurrentTimestamp();
6947
6948                 /*
6949                  * Let postmaster know we've started redo now, so that it can launch
6950                  * checkpointer to perform restartpoints.  We don't bother during
6951                  * crash recovery as restartpoints can only be performed during
6952                  * archive recovery.  And we'd like to keep crash recovery simple, to
6953                  * avoid introducing bugs that could affect you when recovering after
6954                  * crash.
6955                  *
6956                  * After this point, we can no longer assume that we're the only
6957                  * process in addition to postmaster!  Also, fsync requests are
6958                  * subsequently to be handled by the checkpointer, not locally.
6959                  */
6960                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6961                 {
6962                         PublishStartupProcessInformation();
6963                         SetForwardFsyncRequests();
6964                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6965                         bgwriterLaunched = true;
6966                 }
6967
6968                 /*
6969                  * Allow read-only connections immediately if we're consistent
6970                  * already.
6971                  */
6972                 CheckRecoveryConsistency();
6973
6974                 /*
6975                  * Find the first record that logically follows the checkpoint --- it
6976                  * might physically precede it, though.
6977                  */
6978                 if (checkPoint.redo < RecPtr)
6979                 {
6980                         /* back up to find the record */
6981                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6982                 }
6983                 else
6984                 {
6985                         /* just have to read next record after CheckPoint */
6986                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6987                 }
6988
6989                 if (record != NULL)
6990                 {
6991                         ErrorContextCallback errcallback;
6992                         TimestampTz xtime;
6993
6994                         InRedo = true;
6995
6996                         ereport(LOG,
6997                                         (errmsg("redo starts at %X/%X",
6998                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6999
7000                         /*
7001                          * main redo apply loop
7002                          */
7003                         do
7004                         {
7005                                 bool            switchedTLI = false;
7006
7007 #ifdef WAL_DEBUG
7008                                 if (XLOG_DEBUG ||
7009                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7010                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7011                                 {
7012                                         StringInfoData buf;
7013
7014                                         initStringInfo(&buf);
7015                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7016                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7017                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7018                                         xlog_outrec(&buf, xlogreader);
7019                                         appendStringInfoString(&buf, " - ");
7020                                         xlog_outdesc(&buf, xlogreader);
7021                                         elog(LOG, "%s", buf.data);
7022                                         pfree(buf.data);
7023                                 }
7024 #endif
7025
7026                                 /* Handle interrupt signals of startup process */
7027                                 HandleStartupProcInterrupts();
7028
7029                                 /*
7030                                  * Pause WAL replay, if requested by a hot-standby session via
7031                                  * SetRecoveryPause().
7032                                  *
7033                                  * Note that we intentionally don't take the info_lck spinlock
7034                                  * here.  We might therefore read a slightly stale value of
7035                                  * the recoveryPause flag, but it can't be very stale (no
7036                                  * worse than the last spinlock we did acquire).  Since a
7037                                  * pause request is a pretty asynchronous thing anyway,
7038                                  * possibly responding to it one WAL record later than we
7039                                  * otherwise would is a minor issue, so it doesn't seem worth
7040                                  * adding another spinlock cycle to prevent that.
7041                                  */
7042                                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7043                                         recoveryPausesHere();
7044
7045                                 /*
7046                                  * Have we reached our recovery target?
7047                                  */
7048                                 if (recoveryStopsBefore(xlogreader))
7049                                 {
7050                                         reachedStopPoint = true;        /* see below */
7051                                         break;
7052                                 }
7053
7054                                 /*
7055                                  * If we've been asked to lag the master, wait on latch until
7056                                  * enough time has passed.
7057                                  */
7058                                 if (recoveryApplyDelay(xlogreader))
7059                                 {
7060                                         /*
7061                                          * We test for paused recovery again here. If user sets
7062                                          * delayed apply, it may be because they expect to pause
7063                                          * recovery in case of problems, so we must test again
7064                                          * here otherwise pausing during the delay-wait wouldn't
7065                                          * work.
7066                                          */
7067                                         if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7068                                                 recoveryPausesHere();
7069                                 }
7070
7071                                 /* Setup error traceback support for ereport() */
7072                                 errcallback.callback = rm_redo_error_callback;
7073                                 errcallback.arg = (void *) xlogreader;
7074                                 errcallback.previous = error_context_stack;
7075                                 error_context_stack = &errcallback;
7076
7077                                 /*
7078                                  * ShmemVariableCache->nextXid must be beyond record's xid.
7079                                  *
7080                                  * We don't expect anyone else to modify nextXid, hence we
7081                                  * don't need to hold a lock while examining it.  We still
7082                                  * acquire the lock to modify it, though.
7083                                  */
7084                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
7085                                                                                                  ShmemVariableCache->nextXid))
7086                                 {
7087                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7088                                         ShmemVariableCache->nextXid = record->xl_xid;
7089                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
7090                                         LWLockRelease(XidGenLock);
7091                                 }
7092
7093                                 /*
7094                                  * Before replaying this record, check if this record causes
7095                                  * the current timeline to change. The record is already
7096                                  * considered to be part of the new timeline, so we update
7097                                  * ThisTimeLineID before replaying it. That's important so
7098                                  * that replayEndTLI, which is recorded as the minimum
7099                                  * recovery point's TLI if recovery stops after this record,
7100                                  * is set correctly.
7101                                  */
7102                                 if (record->xl_rmid == RM_XLOG_ID)
7103                                 {
7104                                         TimeLineID      newTLI = ThisTimeLineID;
7105                                         TimeLineID      prevTLI = ThisTimeLineID;
7106                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7107
7108                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
7109                                         {
7110                                                 CheckPoint      checkPoint;
7111
7112                                                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7113                                                 newTLI = checkPoint.ThisTimeLineID;
7114                                                 prevTLI = checkPoint.PrevTimeLineID;
7115                                         }
7116                                         else if (info == XLOG_END_OF_RECOVERY)
7117                                         {
7118                                                 xl_end_of_recovery xlrec;
7119
7120                                                 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7121                                                 newTLI = xlrec.ThisTimeLineID;
7122                                                 prevTLI = xlrec.PrevTimeLineID;
7123                                         }
7124
7125                                         if (newTLI != ThisTimeLineID)
7126                                         {
7127                                                 /* Check that it's OK to switch to this TLI */
7128                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7129
7130                                                 /* Following WAL records should be run with new TLI */
7131                                                 ThisTimeLineID = newTLI;
7132                                                 switchedTLI = true;
7133                                         }
7134                                 }
7135
7136                                 /*
7137                                  * Update shared replayEndRecPtr before replaying this record,
7138                                  * so that XLogFlush will update minRecoveryPoint correctly.
7139                                  */
7140                                 SpinLockAcquire(&XLogCtl->info_lck);
7141                                 XLogCtl->replayEndRecPtr = EndRecPtr;
7142                                 XLogCtl->replayEndTLI = ThisTimeLineID;
7143                                 SpinLockRelease(&XLogCtl->info_lck);
7144
7145                                 /*
7146                                  * If we are attempting to enter Hot Standby mode, process
7147                                  * XIDs we see
7148                                  */
7149                                 if (standbyState >= STANDBY_INITIALIZED &&
7150                                         TransactionIdIsValid(record->xl_xid))
7151                                         RecordKnownAssignedTransactionIds(record->xl_xid);
7152
7153                                 /* Now apply the WAL record itself */
7154                                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7155
7156                                 /*
7157                                  * After redo, check whether the backup pages associated with
7158                                  * the WAL record are consistent with the existing pages. This
7159                                  * check is done only if consistency check is enabled for this
7160                                  * record.
7161                                  */
7162                                 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7163                                         checkXLogConsistency(xlogreader);
7164
7165                                 /* Pop the error context stack */
7166                                 error_context_stack = errcallback.previous;
7167
7168                                 /*
7169                                  * Update lastReplayedEndRecPtr after this record has been
7170                                  * successfully replayed.
7171                                  */
7172                                 SpinLockAcquire(&XLogCtl->info_lck);
7173                                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7174                                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7175                                 SpinLockRelease(&XLogCtl->info_lck);
7176
7177                                 /*
7178                                  * If rm_redo called XLogRequestWalReceiverReply, then we wake
7179                                  * up the receiver so that it notices the updated
7180                                  * lastReplayedEndRecPtr and sends a reply to the master.
7181                                  */
7182                                 if (doRequestWalReceiverReply)
7183                                 {
7184                                         doRequestWalReceiverReply = false;
7185                                         WalRcvForceReply();
7186                                 }
7187
7188                                 /* Remember this record as the last-applied one */
7189                                 LastRec = ReadRecPtr;
7190
7191                                 /* Allow read-only connections if we're consistent now */
7192                                 CheckRecoveryConsistency();
7193
7194                                 /* Is this a timeline switch? */
7195                                 if (switchedTLI)
7196                                 {
7197                                         /*
7198                                          * Before we continue on the new timeline, clean up any
7199                                          * (possibly bogus) future WAL segments on the old
7200                                          * timeline.
7201                                          */
7202                                         RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7203
7204                                         /*
7205                                          * Wake up any walsenders to notice that we are on a new
7206                                          * timeline.
7207                                          */
7208                                         if (switchedTLI && AllowCascadeReplication())
7209                                                 WalSndWakeup();
7210                                 }
7211
7212                                 /* Exit loop if we reached inclusive recovery target */
7213                                 if (recoveryStopsAfter(xlogreader))
7214                                 {
7215                                         reachedStopPoint = true;
7216                                         break;
7217                                 }
7218
7219                                 /* Else, try to fetch the next WAL record */
7220                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7221                         } while (record != NULL);
7222
7223                         /*
7224                          * end of main redo apply loop
7225                          */
7226
7227                         if (reachedStopPoint)
7228                         {
7229                                 if (!reachedConsistency)
7230                                         ereport(FATAL,
7231                                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7232
7233                                 /*
7234                                  * This is the last point where we can restart recovery with a
7235                                  * new recovery target, if we shutdown and begin again. After
7236                                  * this, Resource Managers may choose to do permanent
7237                                  * corrective actions at end of recovery.
7238                                  */
7239                                 switch (recoveryTargetAction)
7240                                 {
7241                                         case RECOVERY_TARGET_ACTION_SHUTDOWN:
7242
7243                                                 /*
7244                                                  * exit with special return code to request shutdown
7245                                                  * of postmaster.  Log messages issued from
7246                                                  * postmaster.
7247                                                  */
7248                                                 proc_exit(3);
7249
7250                                         case RECOVERY_TARGET_ACTION_PAUSE:
7251                                                 SetRecoveryPause(true);
7252                                                 recoveryPausesHere();
7253
7254                                                 /* drop into promote */
7255
7256                                         case RECOVERY_TARGET_ACTION_PROMOTE:
7257                                                 break;
7258                                 }
7259                         }
7260
7261                         /* Allow resource managers to do any required cleanup. */
7262                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7263                         {
7264                                 if (RmgrTable[rmid].rm_cleanup != NULL)
7265                                         RmgrTable[rmid].rm_cleanup();
7266                         }
7267
7268                         ereport(LOG,
7269                                         (errmsg("redo done at %X/%X",
7270                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7271                         xtime = GetLatestXTime();
7272                         if (xtime)
7273                                 ereport(LOG,
7274                                          (errmsg("last completed transaction was at log time %s",
7275                                                          timestamptz_to_str(xtime))));
7276
7277                         InRedo = false;
7278                 }
7279                 else
7280                 {
7281                         /* there are no WAL records following the checkpoint */
7282                         ereport(LOG,
7283                                         (errmsg("redo is not required")));
7284                 }
7285         }
7286
7287         /*
7288          * Kill WAL receiver, if it's still running, before we continue to write
7289          * the startup checkpoint record. It will trump over the checkpoint and
7290          * subsequent records if it's still alive when we start writing WAL.
7291          */
7292         ShutdownWalRcv();
7293
7294         /*
7295          * Reset unlogged relations to the contents of their INIT fork. This is
7296          * done AFTER recovery is complete so as to include any unlogged relations
7297          * created during recovery, but BEFORE recovery is marked as having
7298          * completed successfully. Otherwise we'd not retry if any of the post
7299          * end-of-recovery steps fail.
7300          */
7301         if (InRecovery)
7302                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7303
7304         /*
7305          * We don't need the latch anymore. It's not strictly necessary to disown
7306          * it, but let's do it for the sake of tidiness.
7307          */
7308         if (StandbyModeRequested)
7309                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7310
7311         /*
7312          * We are now done reading the xlog from stream. Turn off streaming
7313          * recovery to force fetching the files (which would be required at end of
7314          * recovery, e.g., timeline history file) from archive or pg_wal.
7315          */
7316         StandbyMode = false;
7317
7318         /*
7319          * Re-fetch the last valid or last applied record, so we can identify the
7320          * exact endpoint of what we consider the valid portion of WAL.
7321          */
7322         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7323         EndOfLog = EndRecPtr;
7324
7325         /*
7326          * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7327          * the end-of-log. It could be different from the timeline that EndOfLog
7328          * nominally belongs to, if there was a timeline switch in that segment,
7329          * and we were reading the old WAL from a segment belonging to a higher
7330          * timeline.
7331          */
7332         EndOfLogTLI = xlogreader->readPageTLI;
7333
7334         /*
7335          * Complain if we did not roll forward far enough to render the backup
7336          * dump consistent.  Note: it is indeed okay to look at the local variable
7337          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7338          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7339          * advanced beyond the WAL we processed.
7340          */
7341         if (InRecovery &&
7342                 (EndOfLog < minRecoveryPoint ||
7343                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7344         {
7345                 /*
7346                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7347                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7348                  * tried to recover from an online backup but never called
7349                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7350                  * point. However, this also happens in crash recovery, if the system
7351                  * crashes while an online backup is in progress. We must not treat
7352                  * that as an error, or the database will refuse to start up.
7353                  */
7354                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7355                 {
7356                         if (ControlFile->backupEndRequired)
7357                                 ereport(FATAL,
7358                                                 (errmsg("WAL ends before end of online backup"),
7359                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7360                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7361                                 ereport(FATAL,
7362                                                 (errmsg("WAL ends before end of online backup"),
7363                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7364                         else
7365                                 ereport(FATAL,
7366                                           (errmsg("WAL ends before consistent recovery point")));
7367                 }
7368         }
7369
7370         /*
7371          * Consider whether we need to assign a new timeline ID.
7372          *
7373          * If we are doing an archive recovery, we always assign a new ID.  This
7374          * handles a couple of issues.  If we stopped short of the end of WAL
7375          * during recovery, then we are clearly generating a new timeline and must
7376          * assign it a unique new ID.  Even if we ran to the end, modifying the
7377          * current last segment is problematic because it may result in trying to
7378          * overwrite an already-archived copy of that segment, and we encourage
7379          * DBAs to make their archive_commands reject that.  We can dodge the
7380          * problem by making the new active segment have a new timeline ID.
7381          *
7382          * In a normal crash recovery, we can just extend the timeline we were in.
7383          */
7384         PrevTimeLineID = ThisTimeLineID;
7385         if (ArchiveRecoveryRequested)
7386         {
7387                 char            reason[200];
7388
7389                 Assert(InArchiveRecovery);
7390
7391                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7392                 ereport(LOG,
7393                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7394
7395                 /*
7396                  * Create a comment for the history file to explain why and where
7397                  * timeline changed.
7398                  */
7399                 if (recoveryTarget == RECOVERY_TARGET_XID)
7400                         snprintf(reason, sizeof(reason),
7401                                          "%s transaction %u",
7402                                          recoveryStopAfter ? "after" : "before",
7403                                          recoveryStopXid);
7404                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7405                         snprintf(reason, sizeof(reason),
7406                                          "%s %s\n",
7407                                          recoveryStopAfter ? "after" : "before",
7408                                          timestamptz_to_str(recoveryStopTime));
7409                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7410                         snprintf(reason, sizeof(reason),
7411                                          "%s LSN %X/%X\n",
7412                                          recoveryStopAfter ? "after" : "before",
7413                                          (uint32 ) (recoveryStopLSN >> 32),
7414                                          (uint32) recoveryStopLSN);
7415                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7416                         snprintf(reason, sizeof(reason),
7417                                          "at restore point \"%s\"",
7418                                          recoveryStopName);
7419                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7420                         snprintf(reason, sizeof(reason), "reached consistency");
7421                 else
7422                         snprintf(reason, sizeof(reason), "no recovery target specified");
7423
7424                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7425                                                          EndRecPtr, reason);
7426         }
7427
7428         /* Save the selected TimeLineID in shared memory, too */
7429         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7430         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7431
7432         /*
7433          * We are now done reading the old WAL.  Turn off archive fetching if it
7434          * was active, and make a writable copy of the last WAL segment. (Note
7435          * that we also have a copy of the last block of the old WAL in readBuf;
7436          * we will use that below.)
7437          */
7438         if (ArchiveRecoveryRequested)
7439                 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7440
7441         /*
7442          * Prepare to write WAL starting at EndOfLog position, and init xlog
7443          * buffer cache using the block containing the last record from the
7444          * previous incarnation.
7445          */
7446         Insert = &XLogCtl->Insert;
7447         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7448         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7449
7450         /*
7451          * Tricky point here: readBuf contains the *last* block that the LastRec
7452          * record spans, not the one it starts in.  The last block is indeed the
7453          * one we want to use.
7454          */
7455         if (EndOfLog % XLOG_BLCKSZ != 0)
7456         {
7457                 char       *page;
7458                 int                     len;
7459                 int                     firstIdx;
7460                 XLogRecPtr      pageBeginPtr;
7461
7462                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7463                 Assert(readOff == pageBeginPtr % XLogSegSize);
7464
7465                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7466
7467                 /* Copy the valid part of the last block, and zero the rest */
7468                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7469                 len = EndOfLog % XLOG_BLCKSZ;
7470                 memcpy(page, xlogreader->readBuf, len);
7471                 memset(page + len, 0, XLOG_BLCKSZ - len);
7472
7473                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7474                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7475         }
7476         else
7477         {
7478                 /*
7479                  * There is no partial block to copy. Just set InitializedUpTo, and
7480                  * let the first attempt to insert a log record to initialize the next
7481                  * buffer.
7482                  */
7483                 XLogCtl->InitializedUpTo = EndOfLog;
7484         }
7485
7486         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7487
7488         XLogCtl->LogwrtResult = LogwrtResult;
7489
7490         XLogCtl->LogwrtRqst.Write = EndOfLog;
7491         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7492
7493         /* Pre-scan prepared transactions to find out the range of XIDs present */
7494         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7495
7496         /*
7497          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7498          * record before resource manager writes cleanup WAL records or checkpoint
7499          * record is written.
7500          */
7501         Insert->fullPageWrites = lastFullPageWrites;
7502         LocalSetXLogInsertAllowed();
7503         UpdateFullPageWrites();
7504         LocalXLogInsertAllowed = -1;
7505
7506         if (InRecovery)
7507         {
7508                 /*
7509                  * Perform a checkpoint to update all our recovery activity to disk.
7510                  *
7511                  * Note that we write a shutdown checkpoint rather than an on-line
7512                  * one. This is not particularly critical, but since we may be
7513                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7514                  * the rule that TLI only changes in shutdown checkpoints, which
7515                  * allows some extra error checking in xlog_redo.
7516                  *
7517                  * In fast promotion, only create a lightweight end-of-recovery record
7518                  * instead of a full checkpoint. A checkpoint is requested later,
7519                  * after we're fully out of recovery mode and already accepting
7520                  * queries.
7521                  */
7522                 if (bgwriterLaunched)
7523                 {
7524                         if (fast_promote)
7525                         {
7526                                 checkPointLoc = ControlFile->prevCheckPoint;
7527
7528                                 /*
7529                                  * Confirm the last checkpoint is available for us to recover
7530                                  * from if we fail. Note that we don't check for the secondary
7531                                  * checkpoint since that isn't available in most base backups.
7532                                  */
7533                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7534                                 if (record != NULL)
7535                                 {
7536                                         fast_promoted = true;
7537
7538                                         /*
7539                                          * Insert a special WAL record to mark the end of
7540                                          * recovery, since we aren't doing a checkpoint. That
7541                                          * means that the checkpointer process may likely be in
7542                                          * the middle of a time-smoothed restartpoint and could
7543                                          * continue to be for minutes after this. That sounds
7544                                          * strange, but the effect is roughly the same and it
7545                                          * would be stranger to try to come out of the
7546                                          * restartpoint and then checkpoint. We request a
7547                                          * checkpoint later anyway, just for safety.
7548                                          */
7549                                         CreateEndOfRecoveryRecord();
7550                                 }
7551                         }
7552
7553                         if (!fast_promoted)
7554                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7555                                                                   CHECKPOINT_IMMEDIATE |
7556                                                                   CHECKPOINT_WAIT);
7557                 }
7558                 else
7559                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7560
7561                 /*
7562                  * And finally, execute the recovery_end_command, if any.
7563                  */
7564                 if (recoveryEndCommand)
7565                         ExecuteRecoveryCommand(recoveryEndCommand,
7566                                                                    "recovery_end_command",
7567                                                                    true);
7568         }
7569
7570         if (ArchiveRecoveryRequested)
7571         {
7572                 /*
7573                  * We switched to a new timeline. Clean up segments on the old
7574                  * timeline.
7575                  *
7576                  * If there are any higher-numbered segments on the old timeline,
7577                  * remove them. They might contain valid WAL, but they might also be
7578                  * pre-allocated files containing garbage. In any case, they are not
7579                  * part of the new timeline's history so we don't need them.
7580                  */
7581                 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7582
7583                 /*
7584                  * If the switch happened in the middle of a segment, what to do with
7585                  * the last, partial segment on the old timeline? If we don't archive
7586                  * it, and the server that created the WAL never archives it either
7587                  * (e.g. because it was hit by a meteor), it will never make it to the
7588                  * archive. That's OK from our point of view, because the new segment
7589                  * that we created with the new TLI contains all the WAL from the old
7590                  * timeline up to the switch point. But if you later try to do PITR to
7591                  * the "missing" WAL on the old timeline, recovery won't find it in
7592                  * the archive. It's physically present in the new file with new TLI,
7593                  * but recovery won't look there when it's recovering to the older
7594                  * timeline. On the other hand, if we archive the partial segment, and
7595                  * the original server on that timeline is still running and archives
7596                  * the completed version of the same segment later, it will fail. (We
7597                  * used to do that in 9.4 and below, and it caused such problems).
7598                  *
7599                  * As a compromise, we rename the last segment with the .partial
7600                  * suffix, and archive it. Archive recovery will never try to read
7601                  * .partial segments, so they will normally go unused. But in the odd
7602                  * PITR case, the administrator can copy them manually to the pg_wal
7603                  * directory (removing the suffix). They can be useful in debugging,
7604                  * too.
7605                  *
7606                  * If a .done or .ready file already exists for the old timeline,
7607                  * however, we had already determined that the segment is complete, so
7608                  * we can let it be archived normally. (In particular, if it was
7609                  * restored from the archive to begin with, it's expected to have a
7610                  * .done file).
7611                  */
7612                 if (EndOfLog % XLOG_SEG_SIZE != 0 && XLogArchivingActive())
7613                 {
7614                         char            origfname[MAXFNAMELEN];
7615                         XLogSegNo       endLogSegNo;
7616
7617                         XLByteToPrevSeg(EndOfLog, endLogSegNo);
7618                         XLogFileName(origfname, EndOfLogTLI, endLogSegNo);
7619
7620                         if (!XLogArchiveIsReadyOrDone(origfname))
7621                         {
7622                                 char            origpath[MAXPGPATH];
7623                                 char            partialfname[MAXFNAMELEN];
7624                                 char            partialpath[MAXPGPATH];
7625
7626                                 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo);
7627                                 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7628                                 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7629
7630                                 /*
7631                                  * Make sure there's no .done or .ready file for the .partial
7632                                  * file.
7633                                  */
7634                                 XLogArchiveCleanup(partialfname);
7635
7636                                 durable_rename(origpath, partialpath, ERROR);
7637                                 XLogArchiveNotify(partialfname);
7638                         }
7639                 }
7640         }
7641
7642         /*
7643          * Preallocate additional log files, if wanted.
7644          */
7645         PreallocXlogFiles(EndOfLog);
7646
7647         /*
7648          * Okay, we're officially UP.
7649          */
7650         InRecovery = false;
7651
7652         /* start the archive_timeout timer and LSN running */
7653         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7654         XLogCtl->lastSegSwitchLSN = EndOfLog;
7655
7656         /* also initialize latestCompletedXid, to nextXid - 1 */
7657         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7658         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7659         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7660         LWLockRelease(ProcArrayLock);
7661
7662         /*
7663          * Start up the commit log and subtrans, if not already done for hot
7664          * standby.  (commit timestamps are started below, if necessary.)
7665          */
7666         if (standbyState == STANDBY_DISABLED)
7667         {
7668                 StartupCLOG();
7669                 StartupSUBTRANS(oldestActiveXID);
7670         }
7671
7672         /*
7673          * Perform end of recovery actions for any SLRUs that need it.
7674          */
7675         TrimCLOG();
7676         TrimMultiXact();
7677
7678         /* Reload shared-memory state for prepared transactions */
7679         RecoverPreparedTransactions();
7680
7681         /*
7682          * Shutdown the recovery environment. This must occur after
7683          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7684          */
7685         if (standbyState != STANDBY_DISABLED)
7686                 ShutdownRecoveryTransactionEnvironment();
7687
7688         /* Shut down xlogreader */
7689         if (readFile >= 0)
7690         {
7691                 close(readFile);
7692                 readFile = -1;
7693         }
7694         XLogReaderFree(xlogreader);
7695
7696         /*
7697          * If any of the critical GUCs have changed, log them before we allow
7698          * backends to write WAL.
7699          */
7700         LocalSetXLogInsertAllowed();
7701         XLogReportParameters();
7702
7703         /*
7704          * Local WAL inserts enabled, so it's time to finish initialization of
7705          * commit timestamp.
7706          */
7707         CompleteCommitTsInitialization();
7708
7709         /*
7710          * All done with end-of-recovery actions.
7711          *
7712          * Now allow backends to write WAL and update the control file status in
7713          * consequence.  The boolean flag allowing backends to write WAL is
7714          * updated while holding ControlFileLock to prevent other backends to look
7715          * at an inconsistent state of the control file in shared memory.  There
7716          * is still a small window during which backends can write WAL and the
7717          * control file is still referring to a system not in DB_IN_PRODUCTION
7718          * state while looking at the on-disk control file.
7719          *
7720          * Also, although the boolean flag to allow WAL is probably atomic in
7721          * itself, we use the info_lck here to ensure that there are no race
7722          * conditions concerning visibility of other recent updates to shared
7723          * memory.
7724          */
7725         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7726         ControlFile->state = DB_IN_PRODUCTION;
7727         ControlFile->time = (pg_time_t) time(NULL);
7728
7729         SpinLockAcquire(&XLogCtl->info_lck);
7730         XLogCtl->SharedRecoveryInProgress = false;
7731         SpinLockRelease(&XLogCtl->info_lck);
7732
7733         UpdateControlFile();
7734         LWLockRelease(ControlFileLock);
7735
7736         /*
7737          * If there were cascading standby servers connected to us, nudge any wal
7738          * sender processes to notice that we've been promoted.
7739          */
7740         WalSndWakeup();
7741
7742         /*
7743          * If this was a fast promotion, request an (online) checkpoint now. This
7744          * isn't required for consistency, but the last restartpoint might be far
7745          * back, and in case of a crash, recovering from it might take a longer
7746          * than is appropriate now that we're not in standby mode anymore.
7747          */
7748         if (fast_promoted)
7749                 RequestCheckpoint(CHECKPOINT_FORCE);
7750 }
7751
7752 /*
7753  * Checks if recovery has reached a consistent state. When consistency is
7754  * reached and we have a valid starting standby snapshot, tell postmaster
7755  * that it can start accepting read-only connections.
7756  */
7757 static void
7758 CheckRecoveryConsistency(void)
7759 {
7760         XLogRecPtr      lastReplayedEndRecPtr;
7761
7762         /*
7763          * During crash recovery, we don't reach a consistent state until we've
7764          * replayed all the WAL.
7765          */
7766         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7767                 return;
7768
7769         /*
7770          * assume that we are called in the startup process, and hence don't need
7771          * a lock to read lastReplayedEndRecPtr
7772          */
7773         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7774
7775         /*
7776          * Have we reached the point where our base backup was completed?
7777          */
7778         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7779                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7780         {
7781                 /*
7782                  * We have reached the end of base backup, as indicated by pg_control.
7783                  * The data on disk is now consistent. Reset backupStartPoint and
7784                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7785                  * allow starting up at an earlier point even if recovery is stopped
7786                  * and restarted soon after this.
7787                  */
7788                 elog(DEBUG1, "end of backup reached");
7789
7790                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7791
7792                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7793                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7794
7795                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7796                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7797                 ControlFile->backupEndRequired = false;
7798                 UpdateControlFile();
7799
7800                 LWLockRelease(ControlFileLock);
7801         }
7802
7803         /*
7804          * Have we passed our safe starting point? Note that minRecoveryPoint is
7805          * known to be incorrectly set if ControlFile->backupEndRequired, until
7806          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7807          * minRecoveryPoint. All we know prior to that is that we're not
7808          * consistent yet.
7809          */
7810         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7811                 minRecoveryPoint <= lastReplayedEndRecPtr &&
7812                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7813         {
7814                 /*
7815                  * Check to see if the XLOG sequence contained any unresolved
7816                  * references to uninitialized pages.
7817                  */
7818                 XLogCheckInvalidPages();
7819
7820                 reachedConsistency = true;
7821                 ereport(LOG,
7822                                 (errmsg("consistent recovery state reached at %X/%X",
7823                                                 (uint32) (lastReplayedEndRecPtr >> 32),
7824                                                 (uint32) lastReplayedEndRecPtr)));
7825         }
7826
7827         /*
7828          * Have we got a valid starting snapshot that will allow queries to be
7829          * run? If so, we can tell postmaster that the database is consistent now,
7830          * enabling connections.
7831          */
7832         if (standbyState == STANDBY_SNAPSHOT_READY &&
7833                 !LocalHotStandbyActive &&
7834                 reachedConsistency &&
7835                 IsUnderPostmaster)
7836         {
7837                 SpinLockAcquire(&XLogCtl->info_lck);
7838                 XLogCtl->SharedHotStandbyActive = true;
7839                 SpinLockRelease(&XLogCtl->info_lck);
7840
7841                 LocalHotStandbyActive = true;
7842
7843                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7844         }
7845 }
7846
7847 /*
7848  * Is the system still in recovery?
7849  *
7850  * Unlike testing InRecovery, this works in any process that's connected to
7851  * shared memory.
7852  *
7853  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7854  * variables the first time we see that recovery is finished.
7855  */
7856 bool
7857 RecoveryInProgress(void)
7858 {
7859         /*
7860          * We check shared state each time only until we leave recovery mode. We
7861          * can't re-enter recovery, so there's no need to keep checking after the
7862          * shared variable has once been seen false.
7863          */
7864         if (!LocalRecoveryInProgress)
7865                 return false;
7866         else
7867         {
7868                 /*
7869                  * use volatile pointer to make sure we make a fresh read of the
7870                  * shared variable.
7871                  */
7872                 volatile XLogCtlData *xlogctl = XLogCtl;
7873
7874                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7875
7876                 /*
7877                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7878                  * is finished. InitPostgres() relies upon this behaviour to ensure
7879                  * that InitXLOGAccess() is called at backend startup.  (If you change
7880                  * this, see also LocalSetXLogInsertAllowed.)
7881                  */
7882                 if (!LocalRecoveryInProgress)
7883                 {
7884                         /*
7885                          * If we just exited recovery, make sure we read TimeLineID and
7886                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7887                          * weak memory ordering).
7888                          */
7889                         pg_memory_barrier();
7890                         InitXLOGAccess();
7891                 }
7892
7893                 /*
7894                  * Note: We don't need a memory barrier when we're still in recovery.
7895                  * We might exit recovery immediately after return, so the caller
7896                  * can't rely on 'true' meaning that we're still in recovery anyway.
7897                  */
7898
7899                 return LocalRecoveryInProgress;
7900         }
7901 }
7902
7903 /*
7904  * Is HotStandby active yet? This is only important in special backends
7905  * since normal backends won't ever be able to connect until this returns
7906  * true. Postmaster knows this by way of signal, not via shared memory.
7907  *
7908  * Unlike testing standbyState, this works in any process that's connected to
7909  * shared memory.  (And note that standbyState alone doesn't tell the truth
7910  * anyway.)
7911  */
7912 bool
7913 HotStandbyActive(void)
7914 {
7915         /*
7916          * We check shared state each time only until Hot Standby is active. We
7917          * can't de-activate Hot Standby, so there's no need to keep checking
7918          * after the shared variable has once been seen true.
7919          */
7920         if (LocalHotStandbyActive)
7921                 return true;
7922         else
7923         {
7924                 /* spinlock is essential on machines with weak memory ordering! */
7925                 SpinLockAcquire(&XLogCtl->info_lck);
7926                 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
7927                 SpinLockRelease(&XLogCtl->info_lck);
7928
7929                 return LocalHotStandbyActive;
7930         }
7931 }
7932
7933 /*
7934  * Like HotStandbyActive(), but to be used only in WAL replay code,
7935  * where we don't need to ask any other process what the state is.
7936  */
7937 bool
7938 HotStandbyActiveInReplay(void)
7939 {
7940         Assert(AmStartupProcess() || !IsPostmasterEnvironment);
7941         return LocalHotStandbyActive;
7942 }
7943
7944 /*
7945  * Is this process allowed to insert new WAL records?
7946  *
7947  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7948  * But we also have provisions for forcing the result "true" or "false"
7949  * within specific processes regardless of the global state.
7950  */
7951 bool
7952 XLogInsertAllowed(void)
7953 {
7954         /*
7955          * If value is "unconditionally true" or "unconditionally false", just
7956          * return it.  This provides the normal fast path once recovery is known
7957          * done.
7958          */
7959         if (LocalXLogInsertAllowed >= 0)
7960                 return (bool) LocalXLogInsertAllowed;
7961
7962         /*
7963          * Else, must check to see if we're still in recovery.
7964          */
7965         if (RecoveryInProgress())
7966                 return false;
7967
7968         /*
7969          * On exit from recovery, reset to "unconditionally true", since there is
7970          * no need to keep checking.
7971          */
7972         LocalXLogInsertAllowed = 1;
7973         return true;
7974 }
7975
7976 /*
7977  * Make XLogInsertAllowed() return true in the current process only.
7978  *
7979  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7980  * and even call LocalSetXLogInsertAllowed() again after that.
7981  */
7982 static void
7983 LocalSetXLogInsertAllowed(void)
7984 {
7985         Assert(LocalXLogInsertAllowed == -1);
7986         LocalXLogInsertAllowed = 1;
7987
7988         /* Initialize as RecoveryInProgress() would do when switching state */
7989         InitXLOGAccess();
7990 }
7991
7992 /*
7993  * Subroutine to try to fetch and validate a prior checkpoint record.
7994  *
7995  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7996  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7997  */
7998 static XLogRecord *
7999 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8000                                          int whichChkpt, bool report)
8001 {
8002         XLogRecord *record;
8003         uint8           info;
8004
8005         if (!XRecOffIsValid(RecPtr))
8006         {
8007                 if (!report)
8008                         return NULL;
8009
8010                 switch (whichChkpt)
8011                 {
8012                         case 1:
8013                                 ereport(LOG,
8014                                 (errmsg("invalid primary checkpoint link in control file")));
8015                                 break;
8016                         case 2:
8017                                 ereport(LOG,
8018                                                 (errmsg("invalid secondary checkpoint link in control file")));
8019                                 break;
8020                         default:
8021                                 ereport(LOG,
8022                                    (errmsg("invalid checkpoint link in backup_label file")));
8023                                 break;
8024                 }
8025                 return NULL;
8026         }
8027
8028         record = ReadRecord(xlogreader, RecPtr, LOG, true);
8029
8030         if (record == NULL)
8031         {
8032                 if (!report)
8033                         return NULL;
8034
8035                 switch (whichChkpt)
8036                 {
8037                         case 1:
8038                                 ereport(LOG,
8039                                                 (errmsg("invalid primary checkpoint record")));
8040                                 break;
8041                         case 2:
8042                                 ereport(LOG,
8043                                                 (errmsg("invalid secondary checkpoint record")));
8044                                 break;
8045                         default:
8046                                 ereport(LOG,
8047                                                 (errmsg("invalid checkpoint record")));
8048                                 break;
8049                 }
8050                 return NULL;
8051         }
8052         if (record->xl_rmid != RM_XLOG_ID)
8053         {
8054                 switch (whichChkpt)
8055                 {
8056                         case 1:
8057                                 ereport(LOG,
8058                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
8059                                 break;
8060                         case 2:
8061                                 ereport(LOG,
8062                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
8063                                 break;
8064                         default:
8065                                 ereport(LOG,
8066                                 (errmsg("invalid resource manager ID in checkpoint record")));
8067                                 break;
8068                 }
8069                 return NULL;
8070         }
8071         info = record->xl_info & ~XLR_INFO_MASK;
8072         if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8073                 info != XLOG_CHECKPOINT_ONLINE)
8074         {
8075                 switch (whichChkpt)
8076                 {
8077                         case 1:
8078                                 ereport(LOG,
8079                                    (errmsg("invalid xl_info in primary checkpoint record")));
8080                                 break;
8081                         case 2:
8082                                 ereport(LOG,
8083                                  (errmsg("invalid xl_info in secondary checkpoint record")));
8084                                 break;
8085                         default:
8086                                 ereport(LOG,
8087                                                 (errmsg("invalid xl_info in checkpoint record")));
8088                                 break;
8089                 }
8090                 return NULL;
8091         }
8092         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8093         {
8094                 switch (whichChkpt)
8095                 {
8096                         case 1:
8097                                 ereport(LOG,
8098                                         (errmsg("invalid length of primary checkpoint record")));
8099                                 break;
8100                         case 2:
8101                                 ereport(LOG,
8102                                   (errmsg("invalid length of secondary checkpoint record")));
8103                                 break;
8104                         default:
8105                                 ereport(LOG,
8106                                                 (errmsg("invalid length of checkpoint record")));
8107                                 break;
8108                 }
8109                 return NULL;
8110         }
8111         return record;
8112 }
8113
8114 /*
8115  * This must be called in a backend process before creating WAL records
8116  * (except in a standalone backend, which does StartupXLOG instead).  We need
8117  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8118  *
8119  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8120  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
8121  * unnecessary however, since the postmaster itself never touches XLOG anyway.
8122  */
8123 void
8124 InitXLOGAccess(void)
8125 {
8126         XLogCtlInsert *Insert = &XLogCtl->Insert;
8127
8128         /* ThisTimeLineID doesn't change so we need no lock to copy it */
8129         ThisTimeLineID = XLogCtl->ThisTimeLineID;
8130         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8131
8132         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8133         (void) GetRedoRecPtr();
8134         /* Also update our copy of doPageWrites. */
8135         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8136
8137         /* Also initialize the working areas for constructing WAL records */
8138         InitXLogInsert();
8139 }
8140
8141 /*
8142  * Return the current Redo pointer from shared memory.
8143  *
8144  * As a side-effect, the local RedoRecPtr copy is updated.
8145  */
8146 XLogRecPtr
8147 GetRedoRecPtr(void)
8148 {
8149         XLogRecPtr      ptr;
8150
8151         /*
8152          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8153          * grabbed a WAL insertion lock to read the master copy, someone might
8154          * update it just after we've released the lock.
8155          */
8156         SpinLockAcquire(&XLogCtl->info_lck);
8157         ptr = XLogCtl->RedoRecPtr;
8158         SpinLockRelease(&XLogCtl->info_lck);
8159
8160         if (RedoRecPtr < ptr)
8161                 RedoRecPtr = ptr;
8162
8163         return RedoRecPtr;
8164 }
8165
8166 /*
8167  * Return information needed to decide whether a modified block needs a
8168  * full-page image to be included in the WAL record.
8169  *
8170  * The returned values are cached copies from backend-private memory, and
8171  * possibly out-of-date.  XLogInsertRecord will re-check them against
8172  * up-to-date values, while holding the WAL insert lock.
8173  */
8174 void
8175 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8176 {
8177         *RedoRecPtr_p = RedoRecPtr;
8178         *doPageWrites_p = doPageWrites;
8179 }
8180
8181 /*
8182  * GetInsertRecPtr -- Returns the current insert position.
8183  *
8184  * NOTE: The value *actually* returned is the position of the last full
8185  * xlog page. It lags behind the real insert position by at most 1 page.
8186  * For that, we don't need to scan through WAL insertion locks, and an
8187  * approximation is enough for the current usage of this function.
8188  */
8189 XLogRecPtr
8190 GetInsertRecPtr(void)
8191 {
8192         XLogRecPtr      recptr;
8193
8194         SpinLockAcquire(&XLogCtl->info_lck);
8195         recptr = XLogCtl->LogwrtRqst.Write;
8196         SpinLockRelease(&XLogCtl->info_lck);
8197
8198         return recptr;
8199 }
8200
8201 /*
8202  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8203  * position known to be fsync'd to disk.
8204  */
8205 XLogRecPtr
8206 GetFlushRecPtr(void)
8207 {
8208         SpinLockAcquire(&XLogCtl->info_lck);
8209         LogwrtResult = XLogCtl->LogwrtResult;
8210         SpinLockRelease(&XLogCtl->info_lck);
8211
8212         return LogwrtResult.Flush;
8213 }
8214
8215 /*
8216  * GetLastImportantRecPtr -- Returns the LSN of the last important record
8217  * inserted. All records not explicitly marked as unimportant are considered
8218  * important.
8219  *
8220  * The LSN is determined by computing the maximum of
8221  * WALInsertLocks[i].lastImportantAt.
8222  */
8223 XLogRecPtr
8224 GetLastImportantRecPtr(void)
8225 {
8226         XLogRecPtr      res = InvalidXLogRecPtr;
8227         int                     i;
8228
8229         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8230         {
8231                 XLogRecPtr      last_important;
8232
8233                 /*
8234                  * Need to take a lock to prevent torn reads of the LSN, which are
8235                  * possible on some of the supported platforms. WAL insert locks only
8236                  * support exclusive mode, so we have to use that.
8237                  */
8238                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8239                 last_important = WALInsertLocks[i].l.lastImportantAt;
8240                 LWLockRelease(&WALInsertLocks[i].l.lock);
8241
8242                 if (res < last_important)
8243                         res = last_important;
8244         }
8245
8246         return res;
8247 }
8248
8249 /*
8250  * Get the time and LSN of the last xlog segment switch
8251  */
8252 pg_time_t
8253 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8254 {
8255         pg_time_t       result;
8256
8257         /* Need WALWriteLock, but shared lock is sufficient */
8258         LWLockAcquire(WALWriteLock, LW_SHARED);
8259         result = XLogCtl->lastSegSwitchTime;
8260         *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8261         LWLockRelease(WALWriteLock);
8262
8263         return result;
8264 }
8265
8266 /*
8267  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8268  *
8269  * This is exported for use by code that would like to have 64-bit XIDs.
8270  * We don't really support such things, but all XIDs within the system
8271  * can be presumed "close to" the result, and thus the epoch associated
8272  * with them can be determined.
8273  */
8274 void
8275 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8276 {
8277         uint32          ckptXidEpoch;
8278         TransactionId ckptXid;
8279         TransactionId nextXid;
8280
8281         /* Must read checkpoint info first, else have race condition */
8282         SpinLockAcquire(&XLogCtl->info_lck);
8283         ckptXidEpoch = XLogCtl->ckptXidEpoch;
8284         ckptXid = XLogCtl->ckptXid;
8285         SpinLockRelease(&XLogCtl->info_lck);
8286
8287         /* Now fetch current nextXid */
8288         nextXid = ReadNewTransactionId();
8289
8290         /*
8291          * nextXid is certainly logically later than ckptXid.  So if it's
8292          * numerically less, it must have wrapped into the next epoch.
8293          */
8294         if (nextXid < ckptXid)
8295                 ckptXidEpoch++;
8296
8297         *xid = nextXid;
8298         *epoch = ckptXidEpoch;
8299 }
8300
8301 /*
8302  * This must be called ONCE during postmaster or standalone-backend shutdown
8303  */
8304 void
8305 ShutdownXLOG(int code, Datum arg)
8306 {
8307         /* Don't be chatty in standalone mode */
8308         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8309                         (errmsg("shutting down")));
8310
8311         if (RecoveryInProgress())
8312                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8313         else
8314         {
8315                 /*
8316                  * If archiving is enabled, rotate the last XLOG file so that all the
8317                  * remaining records are archived (postmaster wakes up the archiver
8318                  * process one more time at the end of shutdown). The checkpoint
8319                  * record will go to the next XLOG file and won't be archived (yet).
8320                  */
8321                 if (XLogArchivingActive() && XLogArchiveCommandSet())
8322                         RequestXLogSwitch(false);
8323
8324                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8325         }
8326         ShutdownCLOG();
8327         ShutdownCommitTs();
8328         ShutdownSUBTRANS();
8329         ShutdownMultiXact();
8330 }
8331
8332 /*
8333  * Log start of a checkpoint.
8334  */
8335 static void
8336 LogCheckpointStart(int flags, bool restartpoint)
8337 {
8338         elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8339                  restartpoint ? "restartpoint" : "checkpoint",
8340                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8341                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8342                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8343                  (flags & CHECKPOINT_FORCE) ? " force" : "",
8344                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
8345                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8346                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8347                  (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8348 }
8349
8350 /*
8351  * Log end of a checkpoint.
8352  */
8353 static void
8354 LogCheckpointEnd(bool restartpoint)
8355 {
8356         long            write_secs,
8357                                 sync_secs,
8358                                 total_secs,
8359                                 longest_secs,
8360                                 average_secs;
8361         int                     write_usecs,
8362                                 sync_usecs,
8363                                 total_usecs,
8364                                 longest_usecs,
8365                                 average_usecs;
8366         uint64          average_sync_time;
8367
8368         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8369
8370         TimestampDifference(CheckpointStats.ckpt_write_t,
8371                                                 CheckpointStats.ckpt_sync_t,
8372                                                 &write_secs, &write_usecs);
8373
8374         TimestampDifference(CheckpointStats.ckpt_sync_t,
8375                                                 CheckpointStats.ckpt_sync_end_t,
8376                                                 &sync_secs, &sync_usecs);
8377
8378         /* Accumulate checkpoint timing summary data, in milliseconds. */
8379         BgWriterStats.m_checkpoint_write_time +=
8380                 write_secs * 1000 + write_usecs / 1000;
8381         BgWriterStats.m_checkpoint_sync_time +=
8382                 sync_secs * 1000 + sync_usecs / 1000;
8383
8384         /*
8385          * All of the published timing statistics are accounted for.  Only
8386          * continue if a log message is to be written.
8387          */
8388         if (!log_checkpoints)
8389                 return;
8390
8391         TimestampDifference(CheckpointStats.ckpt_start_t,
8392                                                 CheckpointStats.ckpt_end_t,
8393                                                 &total_secs, &total_usecs);
8394
8395         /*
8396          * Timing values returned from CheckpointStats are in microseconds.
8397          * Convert to the second plus microsecond form that TimestampDifference
8398          * returns for homogeneous printing.
8399          */
8400         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8401         longest_usecs = CheckpointStats.ckpt_longest_sync -
8402                 (uint64) longest_secs *1000000;
8403
8404         average_sync_time = 0;
8405         if (CheckpointStats.ckpt_sync_rels > 0)
8406                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8407                         CheckpointStats.ckpt_sync_rels;
8408         average_secs = (long) (average_sync_time / 1000000);
8409         average_usecs = average_sync_time - (uint64) average_secs *1000000;
8410
8411         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8412                  "%d transaction log file(s) added, %d removed, %d recycled; "
8413                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8414                  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8415                  "distance=%d kB, estimate=%d kB",
8416                  restartpoint ? "restartpoint" : "checkpoint",
8417                  CheckpointStats.ckpt_bufs_written,
8418                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8419                  CheckpointStats.ckpt_segs_added,
8420                  CheckpointStats.ckpt_segs_removed,
8421                  CheckpointStats.ckpt_segs_recycled,
8422                  write_secs, write_usecs / 1000,
8423                  sync_secs, sync_usecs / 1000,
8424                  total_secs, total_usecs / 1000,
8425                  CheckpointStats.ckpt_sync_rels,
8426                  longest_secs, longest_usecs / 1000,
8427                  average_secs, average_usecs / 1000,
8428                  (int) (PrevCheckPointDistance / 1024.0),
8429                  (int) (CheckPointDistanceEstimate / 1024.0));
8430 }
8431
8432 /*
8433  * Update the estimate of distance between checkpoints.
8434  *
8435  * The estimate is used to calculate the number of WAL segments to keep
8436  * preallocated, see XLOGFileSlop().
8437  */
8438 static void
8439 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8440 {
8441         /*
8442          * To estimate the number of segments consumed between checkpoints, keep a
8443          * moving average of the amount of WAL generated in previous checkpoint
8444          * cycles. However, if the load is bursty, with quiet periods and busy
8445          * periods, we want to cater for the peak load. So instead of a plain
8446          * moving average, let the average decline slowly if the previous cycle
8447          * used less WAL than estimated, but bump it up immediately if it used
8448          * more.
8449          *
8450          * When checkpoints are triggered by max_wal_size, this should converge to
8451          * CheckpointSegments * XLOG_SEG_SIZE,
8452          *
8453          * Note: This doesn't pay any attention to what caused the checkpoint.
8454          * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8455          * starting a base backup, are counted the same as those created
8456          * automatically. The slow-decline will largely mask them out, if they are
8457          * not frequent. If they are frequent, it seems reasonable to count them
8458          * in as any others; if you issue a manual checkpoint every 5 minutes and
8459          * never let a timed checkpoint happen, it makes sense to base the
8460          * preallocation on that 5 minute interval rather than whatever
8461          * checkpoint_timeout is set to.
8462          */
8463         PrevCheckPointDistance = nbytes;
8464         if (CheckPointDistanceEstimate < nbytes)
8465                 CheckPointDistanceEstimate = nbytes;
8466         else
8467                 CheckPointDistanceEstimate =
8468                         (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8469 }
8470
8471 /*
8472  * Perform a checkpoint --- either during shutdown, or on-the-fly
8473  *
8474  * flags is a bitwise OR of the following:
8475  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8476  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8477  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8478  *              ignoring checkpoint_completion_target parameter.
8479  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8480  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8481  *              CHECKPOINT_END_OF_RECOVERY).
8482  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8483  *
8484  * Note: flags contains other bits, of interest here only for logging purposes.
8485  * In particular note that this routine is synchronous and does not pay
8486  * attention to CHECKPOINT_WAIT.
8487  *
8488  * If !shutdown then we are writing an online checkpoint. This is a very special
8489  * kind of operation and WAL record because the checkpoint action occurs over
8490  * a period of time yet logically occurs at just a single LSN. The logical
8491  * position of the WAL record (redo ptr) is the same or earlier than the
8492  * physical position. When we replay WAL we locate the checkpoint via its
8493  * physical position then read the redo ptr and actually start replay at the
8494  * earlier logical position. Note that we don't write *anything* to WAL at
8495  * the logical position, so that location could be any other kind of WAL record.
8496  * All of this mechanism allows us to continue working while we checkpoint.
8497  * As a result, timing of actions is critical here and be careful to note that
8498  * this function will likely take minutes to execute on a busy system.
8499  */
8500 void
8501 CreateCheckPoint(int flags)
8502 {
8503         bool            shutdown;
8504         CheckPoint      checkPoint;
8505         XLogRecPtr      recptr;
8506         XLogCtlInsert *Insert = &XLogCtl->Insert;
8507         uint32          freespace;
8508         XLogRecPtr      PriorRedoPtr;
8509         XLogRecPtr      curInsert;
8510         XLogRecPtr      last_important_lsn;
8511         VirtualTransactionId *vxids;
8512         int                     nvxids;
8513
8514         /*
8515          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8516          * issued at a different time.
8517          */
8518         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8519                 shutdown = true;
8520         else
8521                 shutdown = false;
8522
8523         /* sanity check */
8524         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8525                 elog(ERROR, "can't create a checkpoint during recovery");
8526
8527         /*
8528          * Initialize InitXLogInsert working areas before entering the critical
8529          * section.  Normally, this is done by the first call to
8530          * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8531          * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8532          * done below in a critical section, and InitXLogInsert cannot be called
8533          * in a critical section.
8534          */
8535         InitXLogInsert();
8536
8537         /*
8538          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8539          * (This is just pro forma, since in the present system structure there is
8540          * only one process that is allowed to issue checkpoints at any given
8541          * time.)
8542          */
8543         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8544
8545         /*
8546          * Prepare to accumulate statistics.
8547          *
8548          * Note: because it is possible for log_checkpoints to change while a
8549          * checkpoint proceeds, we always accumulate stats, even if
8550          * log_checkpoints is currently off.
8551          */
8552         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8553         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8554
8555         /*
8556          * Use a critical section to force system panic if we have trouble.
8557          */
8558         START_CRIT_SECTION();
8559
8560         if (shutdown)
8561         {
8562                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8563                 ControlFile->state = DB_SHUTDOWNING;
8564                 ControlFile->time = (pg_time_t) time(NULL);
8565                 UpdateControlFile();
8566                 LWLockRelease(ControlFileLock);
8567         }
8568
8569         /*
8570          * Let smgr prepare for checkpoint; this has to happen before we determine
8571          * the REDO pointer.  Note that smgr must not do anything that'd have to
8572          * be undone if we decide no checkpoint is needed.
8573          */
8574         smgrpreckpt();
8575
8576         /* Begin filling in the checkpoint WAL record */
8577         MemSet(&checkPoint, 0, sizeof(checkPoint));
8578         checkPoint.time = (pg_time_t) time(NULL);
8579
8580         /*
8581          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8582          * pointer. This allows us to begin accumulating changes to assemble our
8583          * starting snapshot of locks and transactions.
8584          */
8585         if (!shutdown && XLogStandbyInfoActive())
8586                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8587         else
8588                 checkPoint.oldestActiveXid = InvalidTransactionId;
8589
8590         /*
8591          * Get location of last important record before acquiring insert locks (as
8592          * GetLastImportantRecPtr() also locks WAL locks).
8593          */
8594         last_important_lsn = GetLastImportantRecPtr();
8595
8596         /*
8597          * We must block concurrent insertions while examining insert state to
8598          * determine the checkpoint REDO pointer.
8599          */
8600         WALInsertLockAcquireExclusive();
8601         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8602
8603         /*
8604          * If this isn't a shutdown or forced checkpoint, and if there has been no
8605          * WAL activity requiring a checkpoint, skip it.  The idea here is to
8606          * avoid inserting duplicate checkpoints when the system is idle.
8607          */
8608         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8609                                   CHECKPOINT_FORCE)) == 0)
8610         {
8611                 if (last_important_lsn == ControlFile->checkPoint)
8612                 {
8613                         WALInsertLockRelease();
8614                         LWLockRelease(CheckpointLock);
8615                         END_CRIT_SECTION();
8616                         ereport(DEBUG1,
8617                                         (errmsg("checkpoint skipped due to an idle system")));
8618                         return;
8619                 }
8620         }
8621
8622         /*
8623          * An end-of-recovery checkpoint is created before anyone is allowed to
8624          * write WAL. To allow us to write the checkpoint record, temporarily
8625          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8626          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8627          */
8628         if (flags & CHECKPOINT_END_OF_RECOVERY)
8629                 LocalSetXLogInsertAllowed();
8630
8631         checkPoint.ThisTimeLineID = ThisTimeLineID;
8632         if (flags & CHECKPOINT_END_OF_RECOVERY)
8633                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8634         else
8635                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8636
8637         checkPoint.fullPageWrites = Insert->fullPageWrites;
8638
8639         /*
8640          * Compute new REDO record ptr = location of next XLOG record.
8641          *
8642          * NB: this is NOT necessarily where the checkpoint record itself will be,
8643          * since other backends may insert more XLOG records while we're off doing
8644          * the buffer flush work.  Those XLOG records are logically after the
8645          * checkpoint, even though physically before it.  Got that?
8646          */
8647         freespace = INSERT_FREESPACE(curInsert);
8648         if (freespace == 0)
8649         {
8650                 if (curInsert % XLogSegSize == 0)
8651                         curInsert += SizeOfXLogLongPHD;
8652                 else
8653                         curInsert += SizeOfXLogShortPHD;
8654         }
8655         checkPoint.redo = curInsert;
8656
8657         /*
8658          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8659          * must be done while holding all the insertion locks.
8660          *
8661          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8662          * pointing past where it really needs to point.  This is okay; the only
8663          * consequence is that XLogInsert might back up whole buffers that it
8664          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8665          * XLogInserts that happen while we are dumping buffers must assume that
8666          * their buffer changes are not included in the checkpoint.
8667          */
8668         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8669
8670         /*
8671          * Now we can release the WAL insertion locks, allowing other xacts to
8672          * proceed while we are flushing disk buffers.
8673          */
8674         WALInsertLockRelease();
8675
8676         /* Update the info_lck-protected copy of RedoRecPtr as well */
8677         SpinLockAcquire(&XLogCtl->info_lck);
8678         XLogCtl->RedoRecPtr = checkPoint.redo;
8679         SpinLockRelease(&XLogCtl->info_lck);
8680
8681         /*
8682          * If enabled, log checkpoint start.  We postpone this until now so as not
8683          * to log anything if we decided to skip the checkpoint.
8684          */
8685         if (log_checkpoints)
8686                 LogCheckpointStart(flags, false);
8687
8688         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8689
8690         /*
8691          * Get the other info we need for the checkpoint record.
8692          *
8693          * We don't need to save oldestClogXid in the checkpoint, it only matters
8694          * for the short period in which clog is being truncated, and if we crash
8695          * during that we'll redo the clog truncation and fix up oldestClogXid
8696          * there.
8697          */
8698         LWLockAcquire(XidGenLock, LW_SHARED);
8699         checkPoint.nextXid = ShmemVariableCache->nextXid;
8700         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8701         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8702         LWLockRelease(XidGenLock);
8703
8704         LWLockAcquire(CommitTsLock, LW_SHARED);
8705         checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8706         checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8707         LWLockRelease(CommitTsLock);
8708
8709         /* Increase XID epoch if we've wrapped around since last checkpoint */
8710         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8711         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8712                 checkPoint.nextXidEpoch++;
8713
8714         LWLockAcquire(OidGenLock, LW_SHARED);
8715         checkPoint.nextOid = ShmemVariableCache->nextOid;
8716         if (!shutdown)
8717                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8718         LWLockRelease(OidGenLock);
8719
8720         MultiXactGetCheckptMulti(shutdown,
8721                                                          &checkPoint.nextMulti,
8722                                                          &checkPoint.nextMultiOffset,
8723                                                          &checkPoint.oldestMulti,
8724                                                          &checkPoint.oldestMultiDB);
8725
8726         /*
8727          * Having constructed the checkpoint record, ensure all shmem disk buffers
8728          * and commit-log buffers are flushed to disk.
8729          *
8730          * This I/O could fail for various reasons.  If so, we will fail to
8731          * complete the checkpoint, but there is no reason to force a system
8732          * panic. Accordingly, exit critical section while doing it.
8733          */
8734         END_CRIT_SECTION();
8735
8736         /*
8737          * In some cases there are groups of actions that must all occur on one
8738          * side or the other of a checkpoint record. Before flushing the
8739          * checkpoint record we must explicitly wait for any backend currently
8740          * performing those groups of actions.
8741          *
8742          * One example is end of transaction, so we must wait for any transactions
8743          * that are currently in commit critical sections.  If an xact inserted
8744          * its commit record into XLOG just before the REDO point, then a crash
8745          * restart from the REDO point would not replay that record, which means
8746          * that our flushing had better include the xact's update of pg_xact.  So
8747          * we wait till he's out of his commit critical section before proceeding.
8748          * See notes in RecordTransactionCommit().
8749          *
8750          * Because we've already released the insertion locks, this test is a bit
8751          * fuzzy: it is possible that we will wait for xacts we didn't really need
8752          * to wait for.  But the delay should be short and it seems better to make
8753          * checkpoint take a bit longer than to hold off insertions longer than
8754          * necessary. (In fact, the whole reason we have this issue is that xact.c
8755          * does commit record XLOG insertion and clog update as two separate steps
8756          * protected by different locks, but again that seems best on grounds of
8757          * minimizing lock contention.)
8758          *
8759          * A transaction that has not yet set delayChkpt when we look cannot be at
8760          * risk, since he's not inserted his commit record yet; and one that's
8761          * already cleared it is not at risk either, since he's done fixing clog
8762          * and we will correctly flush the update below.  So we cannot miss any
8763          * xacts we need to wait for.
8764          */
8765         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8766         if (nvxids > 0)
8767         {
8768                 do
8769                 {
8770                         pg_usleep(10000L);      /* wait for 10 msec */
8771                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8772         }
8773         pfree(vxids);
8774
8775         CheckPointGuts(checkPoint.redo, flags);
8776
8777         /*
8778          * Take a snapshot of running transactions and write this to WAL. This
8779          * allows us to reconstruct the state of running transactions during
8780          * archive recovery, if required. Skip, if this info disabled.
8781          *
8782          * If we are shutting down, or Startup process is completing crash
8783          * recovery we don't need to write running xact data.
8784          */
8785         if (!shutdown && XLogStandbyInfoActive())
8786                 LogStandbySnapshot();
8787
8788         START_CRIT_SECTION();
8789
8790         /*
8791          * Now insert the checkpoint record into XLOG.
8792          */
8793         XLogBeginInsert();
8794         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
8795         recptr = XLogInsert(RM_XLOG_ID,
8796                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8797                                                 XLOG_CHECKPOINT_ONLINE);
8798
8799         XLogFlush(recptr);
8800
8801         /*
8802          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8803          * overwritten at next startup.  No-one should even try, this just allows
8804          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8805          * to just temporarily disable writing until the system has exited
8806          * recovery.
8807          */
8808         if (shutdown)
8809         {
8810                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8811                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8812                 else
8813                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8814         }
8815
8816         /*
8817          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8818          * = end of actual checkpoint record.
8819          */
8820         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8821                 ereport(PANIC,
8822                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8823
8824         /*
8825          * Remember the prior checkpoint's redo pointer, used later to determine
8826          * the point where the log can be truncated.
8827          */
8828         PriorRedoPtr = ControlFile->checkPointCopy.redo;
8829
8830         /*
8831          * Update the control file.
8832          */
8833         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8834         if (shutdown)
8835                 ControlFile->state = DB_SHUTDOWNED;
8836         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8837         ControlFile->checkPoint = ProcLastRecPtr;
8838         ControlFile->checkPointCopy = checkPoint;
8839         ControlFile->time = (pg_time_t) time(NULL);
8840         /* crash recovery should always recover to the end of WAL */
8841         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8842         ControlFile->minRecoveryPointTLI = 0;
8843
8844         /*
8845          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8846          * unused on non-shutdown checkpoints, but seems useful to store it always
8847          * for debugging purposes.
8848          */
8849         SpinLockAcquire(&XLogCtl->ulsn_lck);
8850         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8851         SpinLockRelease(&XLogCtl->ulsn_lck);
8852
8853         UpdateControlFile();
8854         LWLockRelease(ControlFileLock);
8855
8856         /* Update shared-memory copy of checkpoint XID/epoch */
8857         SpinLockAcquire(&XLogCtl->info_lck);
8858         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
8859         XLogCtl->ckptXid = checkPoint.nextXid;
8860         SpinLockRelease(&XLogCtl->info_lck);
8861
8862         /*
8863          * We are now done with critical updates; no need for system panic if we
8864          * have trouble while fooling with old log segments.
8865          */
8866         END_CRIT_SECTION();
8867
8868         /*
8869          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8870          */
8871         smgrpostckpt();
8872
8873         /*
8874          * Delete old log files (those no longer needed even for previous
8875          * checkpoint or the standbys in XLOG streaming).
8876          */
8877         if (PriorRedoPtr != InvalidXLogRecPtr)
8878         {
8879                 XLogSegNo       _logSegNo;
8880
8881                 /* Update the average distance between checkpoints. */
8882                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
8883
8884                 XLByteToSeg(PriorRedoPtr, _logSegNo);
8885                 KeepLogSeg(recptr, &_logSegNo);
8886                 _logSegNo--;
8887                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
8888         }
8889
8890         /*
8891          * Make more log segments if needed.  (Do this after recycling old log
8892          * segments, since that may supply some of the needed files.)
8893          */
8894         if (!shutdown)
8895                 PreallocXlogFiles(recptr);
8896
8897         /*
8898          * Truncate pg_subtrans if possible.  We can throw away all data before
8899          * the oldest XMIN of any running transaction.  No future transaction will
8900          * attempt to reference any pg_subtrans entry older than that (see Asserts
8901          * in subtrans.c).  During recovery, though, we mustn't do this because
8902          * StartupSUBTRANS hasn't been called yet.
8903          */
8904         if (!RecoveryInProgress())
8905                 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
8906
8907         /* Real work is done, but log and update stats before releasing lock. */
8908         LogCheckpointEnd(false);
8909
8910         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8911                                                                          NBuffers,
8912                                                                          CheckpointStats.ckpt_segs_added,
8913                                                                          CheckpointStats.ckpt_segs_removed,
8914                                                                          CheckpointStats.ckpt_segs_recycled);
8915
8916         LWLockRelease(CheckpointLock);
8917 }
8918
8919 /*
8920  * Mark the end of recovery in WAL though without running a full checkpoint.
8921  * We can expect that a restartpoint is likely to be in progress as we
8922  * do this, though we are unwilling to wait for it to complete. So be
8923  * careful to avoid taking the CheckpointLock anywhere here.
8924  *
8925  * CreateRestartPoint() allows for the case where recovery may end before
8926  * the restartpoint completes so there is no concern of concurrent behaviour.
8927  */
8928 static void
8929 CreateEndOfRecoveryRecord(void)
8930 {
8931         xl_end_of_recovery xlrec;
8932         XLogRecPtr      recptr;
8933
8934         /* sanity check */
8935         if (!RecoveryInProgress())
8936                 elog(ERROR, "can only be used to end recovery");
8937
8938         xlrec.end_time = GetCurrentTimestamp();
8939
8940         WALInsertLockAcquireExclusive();
8941         xlrec.ThisTimeLineID = ThisTimeLineID;
8942         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8943         WALInsertLockRelease();
8944
8945         LocalSetXLogInsertAllowed();
8946
8947         START_CRIT_SECTION();
8948
8949         XLogBeginInsert();
8950         XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
8951         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
8952
8953         XLogFlush(recptr);
8954
8955         /*
8956          * Update the control file so that crash recovery can follow the timeline
8957          * changes to this point.
8958          */
8959         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8960         ControlFile->time = (pg_time_t) time(NULL);
8961         ControlFile->minRecoveryPoint = recptr;
8962         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8963         UpdateControlFile();
8964         LWLockRelease(ControlFileLock);
8965
8966         END_CRIT_SECTION();
8967
8968         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8969 }
8970
8971 /*
8972  * Flush all data in shared memory to disk, and fsync
8973  *
8974  * This is the common code shared between regular checkpoints and
8975  * recovery restartpoints.
8976  */
8977 static void
8978 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8979 {
8980         CheckPointCLOG();
8981         CheckPointCommitTs();
8982         CheckPointSUBTRANS();
8983         CheckPointMultiXact();
8984         CheckPointPredicate();
8985         CheckPointRelationMap();
8986         CheckPointReplicationSlots();
8987         CheckPointSnapBuild();
8988         CheckPointLogicalRewriteHeap();
8989         CheckPointBuffers(flags);       /* performs all required fsyncs */
8990         CheckPointReplicationOrigin();
8991         /* We deliberately delay 2PC checkpointing as long as possible */
8992         CheckPointTwoPhase(checkPointRedo);
8993 }
8994
8995 /*
8996  * Save a checkpoint for recovery restart if appropriate
8997  *
8998  * This function is called each time a checkpoint record is read from XLOG.
8999  * It must determine whether the checkpoint represents a safe restartpoint or
9000  * not.  If so, the checkpoint record is stashed in shared memory so that
9001  * CreateRestartPoint can consult it.  (Note that the latter function is
9002  * executed by the checkpointer, while this one will be executed by the
9003  * startup process.)
9004  */
9005 static void
9006 RecoveryRestartPoint(const CheckPoint *checkPoint)
9007 {
9008         /*
9009          * Also refrain from creating a restartpoint if we have seen any
9010          * references to non-existent pages. Restarting recovery from the
9011          * restartpoint would not see the references, so we would lose the
9012          * cross-check that the pages belonged to a relation that was dropped
9013          * later.
9014          */
9015         if (XLogHaveInvalidPages())
9016         {
9017                 elog(trace_recovery(DEBUG2),
9018                          "could not record restart point at %X/%X because there "
9019                          "are unresolved references to invalid pages",
9020                          (uint32) (checkPoint->redo >> 32),
9021                          (uint32) checkPoint->redo);
9022                 return;
9023         }
9024
9025         /*
9026          * Copy the checkpoint record to shared memory, so that checkpointer can
9027          * work out the next time it wants to perform a restartpoint.
9028          */
9029         SpinLockAcquire(&XLogCtl->info_lck);
9030         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
9031         XLogCtl->lastCheckPointEndPtr = EndRecPtr;
9032         XLogCtl->lastCheckPoint = *checkPoint;
9033         SpinLockRelease(&XLogCtl->info_lck);
9034 }
9035
9036 /*
9037  * Establish a restartpoint if possible.
9038  *
9039  * This is similar to CreateCheckPoint, but is used during WAL recovery
9040  * to establish a point from which recovery can roll forward without
9041  * replaying the entire recovery log.
9042  *
9043  * Returns true if a new restartpoint was established. We can only establish
9044  * a restartpoint if we have replayed a safe checkpoint record since last
9045  * restartpoint.
9046  */
9047 bool
9048 CreateRestartPoint(int flags)
9049 {
9050         XLogRecPtr      lastCheckPointRecPtr;
9051         XLogRecPtr      lastCheckPointEndPtr;
9052         CheckPoint      lastCheckPoint;
9053         XLogRecPtr      PriorRedoPtr;
9054         TimestampTz xtime;
9055
9056         /*
9057          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9058          * happens at a time.
9059          */
9060         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9061
9062         /* Get a local copy of the last safe checkpoint record. */
9063         SpinLockAcquire(&XLogCtl->info_lck);
9064         lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9065         lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9066         lastCheckPoint = XLogCtl->lastCheckPoint;
9067         SpinLockRelease(&XLogCtl->info_lck);
9068
9069         /*
9070          * Check that we're still in recovery mode. It's ok if we exit recovery
9071          * mode after this check, the restart point is valid anyway.
9072          */
9073         if (!RecoveryInProgress())
9074         {
9075                 ereport(DEBUG2,
9076                           (errmsg("skipping restartpoint, recovery has already ended")));
9077                 LWLockRelease(CheckpointLock);
9078                 return false;
9079         }
9080
9081         /*
9082          * If the last checkpoint record we've replayed is already our last
9083          * restartpoint, we can't perform a new restart point. We still update
9084          * minRecoveryPoint in that case, so that if this is a shutdown restart
9085          * point, we won't start up earlier than before. That's not strictly
9086          * necessary, but when hot standby is enabled, it would be rather weird if
9087          * the database opened up for read-only connections at a point-in-time
9088          * before the last shutdown. Such time travel is still possible in case of
9089          * immediate shutdown, though.
9090          *
9091          * We don't explicitly advance minRecoveryPoint when we do create a
9092          * restartpoint. It's assumed that flushing the buffers will do that as a
9093          * side-effect.
9094          */
9095         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9096                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9097         {
9098                 ereport(DEBUG2,
9099                                 (errmsg("skipping restartpoint, already performed at %X/%X",
9100                                                 (uint32) (lastCheckPoint.redo >> 32),
9101                                                 (uint32) lastCheckPoint.redo)));
9102
9103                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9104                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9105                 {
9106                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9107                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9108                         ControlFile->time = (pg_time_t) time(NULL);
9109                         UpdateControlFile();
9110                         LWLockRelease(ControlFileLock);
9111                 }
9112                 LWLockRelease(CheckpointLock);
9113                 return false;
9114         }
9115
9116         /*
9117          * Update the shared RedoRecPtr so that the startup process can calculate
9118          * the number of segments replayed since last restartpoint, and request a
9119          * restartpoint if it exceeds CheckPointSegments.
9120          *
9121          * Like in CreateCheckPoint(), hold off insertions to update it, although
9122          * during recovery this is just pro forma, because no WAL insertions are
9123          * happening.
9124          */
9125         WALInsertLockAcquireExclusive();
9126         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9127         WALInsertLockRelease();
9128
9129         /* Also update the info_lck-protected copy */
9130         SpinLockAcquire(&XLogCtl->info_lck);
9131         XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9132         SpinLockRelease(&XLogCtl->info_lck);
9133
9134         /*
9135          * Prepare to accumulate statistics.
9136          *
9137          * Note: because it is possible for log_checkpoints to change while a
9138          * checkpoint proceeds, we always accumulate stats, even if
9139          * log_checkpoints is currently off.
9140          */
9141         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9142         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9143
9144         if (log_checkpoints)
9145                 LogCheckpointStart(flags, true);
9146
9147         CheckPointGuts(lastCheckPoint.redo, flags);
9148
9149         /*
9150          * Remember the prior checkpoint's redo pointer, used later to determine
9151          * the point at which we can truncate the log.
9152          */
9153         PriorRedoPtr = ControlFile->checkPointCopy.redo;
9154
9155         /*
9156          * Update pg_control, using current time.  Check that it still shows
9157          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9158          * this is a quick hack to make sure nothing really bad happens if somehow
9159          * we get here after the end-of-recovery checkpoint.
9160          */
9161         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9162         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9163                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9164         {
9165                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
9166                 ControlFile->checkPoint = lastCheckPointRecPtr;
9167                 ControlFile->checkPointCopy = lastCheckPoint;
9168                 ControlFile->time = (pg_time_t) time(NULL);
9169
9170                 /*
9171                  * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
9172                  * this will have happened already while writing out dirty buffers,
9173                  * but not necessarily - e.g. because no buffers were dirtied.  We do
9174                  * this because a non-exclusive base backup uses minRecoveryPoint to
9175                  * determine which WAL files must be included in the backup, and the
9176                  * file (or files) containing the checkpoint record must be included,
9177                  * at a minimum. Note that for an ordinary restart of recovery there's
9178                  * no value in having the minimum recovery point any earlier than this
9179                  * anyway, because redo will begin just after the checkpoint record.
9180                  */
9181                 if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9182                 {
9183                         ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9184                         ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9185
9186                         /* update local copy */
9187                         minRecoveryPoint = ControlFile->minRecoveryPoint;
9188                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9189                 }
9190                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9191                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9192                 UpdateControlFile();
9193         }
9194         LWLockRelease(ControlFileLock);
9195
9196         /*
9197          * Delete old log files (those no longer needed even for previous
9198          * checkpoint/restartpoint) to prevent the disk holding the xlog from
9199          * growing full.
9200          */
9201         if (PriorRedoPtr != InvalidXLogRecPtr)
9202         {
9203                 XLogRecPtr      receivePtr;
9204                 XLogRecPtr      replayPtr;
9205                 TimeLineID      replayTLI;
9206                 XLogRecPtr      endptr;
9207                 XLogSegNo       _logSegNo;
9208
9209                 /* Update the average distance between checkpoints/restartpoints. */
9210                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9211
9212                 XLByteToSeg(PriorRedoPtr, _logSegNo);
9213
9214                 /*
9215                  * Get the current end of xlog replayed or received, whichever is
9216                  * later.
9217                  */
9218                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9219                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
9220                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9221
9222                 KeepLogSeg(endptr, &_logSegNo);
9223                 _logSegNo--;
9224
9225                 /*
9226                  * Try to recycle segments on a useful timeline. If we've been
9227                  * promoted since the beginning of this restartpoint, use the new
9228                  * timeline chosen at end of recovery (RecoveryInProgress() sets
9229                  * ThisTimeLineID in that case). If we're still in recovery, use the
9230                  * timeline we're currently replaying.
9231                  *
9232                  * There is no guarantee that the WAL segments will be useful on the
9233                  * current timeline; if recovery proceeds to a new timeline right
9234                  * after this, the pre-allocated WAL segments on this timeline will
9235                  * not be used, and will go wasted until recycled on the next
9236                  * restartpoint. We'll live with that.
9237                  */
9238                 if (RecoveryInProgress())
9239                         ThisTimeLineID = replayTLI;
9240
9241                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
9242
9243                 /*
9244                  * Make more log segments if needed.  (Do this after recycling old log
9245                  * segments, since that may supply some of the needed files.)
9246                  */
9247                 PreallocXlogFiles(endptr);
9248
9249                 /*
9250                  * ThisTimeLineID is normally not set when we're still in recovery.
9251                  * However, recycling/preallocating segments above needed
9252                  * ThisTimeLineID to determine which timeline to install the segments
9253                  * on. Reset it now, to restore the normal state of affairs for
9254                  * debugging purposes.
9255                  */
9256                 if (RecoveryInProgress())
9257                         ThisTimeLineID = 0;
9258         }
9259
9260         /*
9261          * Truncate pg_subtrans if possible.  We can throw away all data before
9262          * the oldest XMIN of any running transaction.  No future transaction will
9263          * attempt to reference any pg_subtrans entry older than that (see Asserts
9264          * in subtrans.c).  When hot standby is disabled, though, we mustn't do
9265          * this because StartupSUBTRANS hasn't been called yet.
9266          */
9267         if (EnableHotStandby)
9268                 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9269
9270         /* Real work is done, but log and update before releasing lock. */
9271         LogCheckpointEnd(true);
9272
9273         xtime = GetLatestXTime();
9274         ereport((log_checkpoints ? LOG : DEBUG2),
9275                         (errmsg("recovery restart point at %X/%X",
9276                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9277                    xtime ? errdetail("last completed transaction was at log time %s",
9278                                                          timestamptz_to_str(xtime)) : 0));
9279
9280         LWLockRelease(CheckpointLock);
9281
9282         /*
9283          * Finally, execute archive_cleanup_command, if any.
9284          */
9285         if (XLogCtl->archiveCleanupCommand[0])
9286                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9287                                                            "archive_cleanup_command",
9288                                                            false);
9289
9290         return true;
9291 }
9292
9293 /*
9294  * Retreat *logSegNo to the last segment that we need to retain because of
9295  * either wal_keep_segments or replication slots.
9296  *
9297  * This is calculated by subtracting wal_keep_segments from the given xlog
9298  * location, recptr and by making sure that that result is below the
9299  * requirement of replication slots.
9300  */
9301 static void
9302 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9303 {
9304         XLogSegNo       segno;
9305         XLogRecPtr      keep;
9306
9307         XLByteToSeg(recptr, segno);
9308         keep = XLogGetReplicationSlotMinimumLSN();
9309
9310         /* compute limit for wal_keep_segments first */
9311         if (wal_keep_segments > 0)
9312         {
9313                 /* avoid underflow, don't go below 1 */
9314                 if (segno <= wal_keep_segments)
9315                         segno = 1;
9316                 else
9317                         segno = segno - wal_keep_segments;
9318         }
9319
9320         /* then check whether slots limit removal further */
9321         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9322         {
9323                 XLogSegNo       slotSegNo;
9324
9325                 XLByteToSeg(keep, slotSegNo);
9326
9327                 if (slotSegNo <= 0)
9328                         segno = 1;
9329                 else if (slotSegNo < segno)
9330                         segno = slotSegNo;
9331         }
9332
9333         /* don't delete WAL segments newer than the calculated segment */
9334         if (segno < *logSegNo)
9335                 *logSegNo = segno;
9336 }
9337
9338 /*
9339  * Write a NEXTOID log record
9340  */
9341 void
9342 XLogPutNextOid(Oid nextOid)
9343 {
9344         XLogBeginInsert();
9345         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9346         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9347
9348         /*
9349          * We need not flush the NEXTOID record immediately, because any of the
9350          * just-allocated OIDs could only reach disk as part of a tuple insert or
9351          * update that would have its own XLOG record that must follow the NEXTOID
9352          * record.  Therefore, the standard buffer LSN interlock applied to those
9353          * records will ensure no such OID reaches disk before the NEXTOID record
9354          * does.
9355          *
9356          * Note, however, that the above statement only covers state "within" the
9357          * database.  When we use a generated OID as a file or directory name, we
9358          * are in a sense violating the basic WAL rule, because that filesystem
9359          * change may reach disk before the NEXTOID WAL record does.  The impact
9360          * of this is that if a database crash occurs immediately afterward, we
9361          * might after restart re-generate the same OID and find that it conflicts
9362          * with the leftover file or directory.  But since for safety's sake we
9363          * always loop until finding a nonconflicting filename, this poses no real
9364          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9365          */
9366 }
9367
9368 /*
9369  * Write an XLOG SWITCH record.
9370  *
9371  * Here we just blindly issue an XLogInsert request for the record.
9372  * All the magic happens inside XLogInsert.
9373  *
9374  * The return value is either the end+1 address of the switch record,
9375  * or the end+1 address of the prior segment if we did not need to
9376  * write a switch record because we are already at segment start.
9377  */
9378 XLogRecPtr
9379 RequestXLogSwitch(bool mark_unimportant)
9380 {
9381         XLogRecPtr      RecPtr;
9382
9383         /* XLOG SWITCH has no data */
9384         XLogBeginInsert();
9385
9386         if (mark_unimportant)
9387                 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9388         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9389
9390         return RecPtr;
9391 }
9392
9393 /*
9394  * Write a RESTORE POINT record
9395  */
9396 XLogRecPtr
9397 XLogRestorePoint(const char *rpName)
9398 {
9399         XLogRecPtr      RecPtr;
9400         xl_restore_point xlrec;
9401
9402         xlrec.rp_time = GetCurrentTimestamp();
9403         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9404
9405         XLogBeginInsert();
9406         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9407
9408         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9409
9410         ereport(LOG,
9411                         (errmsg("restore point \"%s\" created at %X/%X",
9412                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9413
9414         return RecPtr;
9415 }
9416
9417 /*
9418  * Check if any of the GUC parameters that are critical for hot standby
9419  * have changed, and update the value in pg_control file if necessary.
9420  */
9421 static void
9422 XLogReportParameters(void)
9423 {
9424         if (wal_level != ControlFile->wal_level ||
9425                 wal_log_hints != ControlFile->wal_log_hints ||
9426                 MaxConnections != ControlFile->MaxConnections ||
9427                 max_worker_processes != ControlFile->max_worker_processes ||
9428                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9429                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
9430                 track_commit_timestamp != ControlFile->track_commit_timestamp)
9431         {
9432                 /*
9433                  * The change in number of backend slots doesn't need to be WAL-logged
9434                  * if archiving is not enabled, as you can't start archive recovery
9435                  * with wal_level=minimal anyway. We don't really care about the
9436                  * values in pg_control either if wal_level=minimal, but seems better
9437                  * to keep them up-to-date to avoid confusion.
9438                  */
9439                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9440                 {
9441                         xl_parameter_change xlrec;
9442                         XLogRecPtr      recptr;
9443
9444                         xlrec.MaxConnections = MaxConnections;
9445                         xlrec.max_worker_processes = max_worker_processes;
9446                         xlrec.max_prepared_xacts = max_prepared_xacts;
9447                         xlrec.max_locks_per_xact = max_locks_per_xact;
9448                         xlrec.wal_level = wal_level;
9449                         xlrec.wal_log_hints = wal_log_hints;
9450                         xlrec.track_commit_timestamp = track_commit_timestamp;
9451
9452                         XLogBeginInsert();
9453                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9454
9455                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9456                         XLogFlush(recptr);
9457                 }
9458
9459                 ControlFile->MaxConnections = MaxConnections;
9460                 ControlFile->max_worker_processes = max_worker_processes;
9461                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9462                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9463                 ControlFile->wal_level = wal_level;
9464                 ControlFile->wal_log_hints = wal_log_hints;
9465                 ControlFile->track_commit_timestamp = track_commit_timestamp;
9466                 UpdateControlFile();
9467         }
9468 }
9469
9470 /*
9471  * Update full_page_writes in shared memory, and write an
9472  * XLOG_FPW_CHANGE record if necessary.
9473  *
9474  * Note: this function assumes there is no other process running
9475  * concurrently that could update it.
9476  */
9477 void
9478 UpdateFullPageWrites(void)
9479 {
9480         XLogCtlInsert *Insert = &XLogCtl->Insert;
9481
9482         /*
9483          * Do nothing if full_page_writes has not been changed.
9484          *
9485          * It's safe to check the shared full_page_writes without the lock,
9486          * because we assume that there is no concurrently running process which
9487          * can update it.
9488          */
9489         if (fullPageWrites == Insert->fullPageWrites)
9490                 return;
9491
9492         START_CRIT_SECTION();
9493
9494         /*
9495          * It's always safe to take full page images, even when not strictly
9496          * required, but not the other round. So if we're setting full_page_writes
9497          * to true, first set it true and then write the WAL record. If we're
9498          * setting it to false, first write the WAL record and then set the global
9499          * flag.
9500          */
9501         if (fullPageWrites)
9502         {
9503                 WALInsertLockAcquireExclusive();
9504                 Insert->fullPageWrites = true;
9505                 WALInsertLockRelease();
9506         }
9507
9508         /*
9509          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9510          * full_page_writes during archive recovery, if required.
9511          */
9512         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9513         {
9514                 XLogBeginInsert();
9515                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
9516
9517                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
9518         }
9519
9520         if (!fullPageWrites)
9521         {
9522                 WALInsertLockAcquireExclusive();
9523                 Insert->fullPageWrites = false;
9524                 WALInsertLockRelease();
9525         }
9526         END_CRIT_SECTION();
9527 }
9528
9529 /*
9530  * Check that it's OK to switch to new timeline during recovery.
9531  *
9532  * 'lsn' is the address of the shutdown checkpoint record we're about to
9533  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9534  */
9535 static void
9536 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9537 {
9538         /* Check that the record agrees on what the current (old) timeline is */
9539         if (prevTLI != ThisTimeLineID)
9540                 ereport(PANIC,
9541                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9542                                                 prevTLI, ThisTimeLineID)));
9543
9544         /*
9545          * The new timeline better be in the list of timelines we expect to see,
9546          * according to the timeline history. It should also not decrease.
9547          */
9548         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9549                 ereport(PANIC,
9550                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9551                                  newTLI, ThisTimeLineID)));
9552
9553         /*
9554          * If we have not yet reached min recovery point, and we're about to
9555          * switch to a timeline greater than the timeline of the min recovery
9556          * point: trouble. After switching to the new timeline, we could not
9557          * possibly visit the min recovery point on the correct timeline anymore.
9558          * This can happen if there is a newer timeline in the archive that
9559          * branched before the timeline the min recovery point is on, and you
9560          * attempt to do PITR to the new timeline.
9561          */
9562         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9563                 lsn < minRecoveryPoint &&
9564                 newTLI > minRecoveryPointTLI)
9565                 ereport(PANIC,
9566                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9567                                                 newTLI,
9568                                                 (uint32) (minRecoveryPoint >> 32),
9569                                                 (uint32) minRecoveryPoint,
9570                                                 minRecoveryPointTLI)));
9571
9572         /* Looks good */
9573 }
9574
9575 /*
9576  * XLOG resource manager's routines
9577  *
9578  * Definitions of info values are in include/catalog/pg_control.h, though
9579  * not all record types are related to control file updates.
9580  */
9581 void
9582 xlog_redo(XLogReaderState *record)
9583 {
9584         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9585         XLogRecPtr      lsn = record->EndRecPtr;
9586
9587         /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
9588         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
9589                    !XLogRecHasAnyBlockRefs(record));
9590
9591         if (info == XLOG_NEXTOID)
9592         {
9593                 Oid                     nextOid;
9594
9595                 /*
9596                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9597                  * and the recorded nextOid, but that fails if the OID counter wraps
9598                  * around.  Since no OID allocation should be happening during replay
9599                  * anyway, better to just believe the record exactly.  We still take
9600                  * OidGenLock while setting the variable, just in case.
9601                  */
9602                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9603                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9604                 ShmemVariableCache->nextOid = nextOid;
9605                 ShmemVariableCache->oidCount = 0;
9606                 LWLockRelease(OidGenLock);
9607         }
9608         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9609         {
9610                 CheckPoint      checkPoint;
9611
9612                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9613                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9614                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9615                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9616                 LWLockRelease(XidGenLock);
9617                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9618                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9619                 ShmemVariableCache->oidCount = 0;
9620                 LWLockRelease(OidGenLock);
9621                 MultiXactSetNextMXact(checkPoint.nextMulti,
9622                                                           checkPoint.nextMultiOffset);
9623
9624                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9625                                                            checkPoint.oldestMultiDB);
9626                 /*
9627                  * No need to set oldestClogXid here as well; it'll be set when we
9628                  * redo an xl_clog_truncate if it changed since initialization.
9629                  */
9630                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9631
9632                 /*
9633                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9634                  * record, the backup was canceled and the end-of-backup record will
9635                  * never arrive.
9636                  */
9637                 if (ArchiveRecoveryRequested &&
9638                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9639                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9640                         ereport(PANIC,
9641                         (errmsg("online backup was canceled, recovery cannot continue")));
9642
9643                 /*
9644                  * If we see a shutdown checkpoint, we know that nothing was running
9645                  * on the master at this point. So fake-up an empty running-xacts
9646                  * record and use that here and now. Recover additional standby state
9647                  * for prepared transactions.
9648                  */
9649                 if (standbyState >= STANDBY_INITIALIZED)
9650                 {
9651                         TransactionId *xids;
9652                         int                     nxids;
9653                         TransactionId oldestActiveXID;
9654                         TransactionId latestCompletedXid;
9655                         RunningTransactionsData running;
9656
9657                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9658
9659                         /*
9660                          * Construct a RunningTransactions snapshot representing a shut
9661                          * down server, with only prepared transactions still alive. We're
9662                          * never overflowed at this point because all subxids are listed
9663                          * with their parent prepared transactions.
9664                          */
9665                         running.xcnt = nxids;
9666                         running.subxcnt = 0;
9667                         running.subxid_overflow = false;
9668                         running.nextXid = checkPoint.nextXid;
9669                         running.oldestRunningXid = oldestActiveXID;
9670                         latestCompletedXid = checkPoint.nextXid;
9671                         TransactionIdRetreat(latestCompletedXid);
9672                         Assert(TransactionIdIsNormal(latestCompletedXid));
9673                         running.latestCompletedXid = latestCompletedXid;
9674                         running.xids = xids;
9675
9676                         ProcArrayApplyRecoveryInfo(&running);
9677
9678                         StandbyRecoverPreparedTransactions(true);
9679                 }
9680
9681                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9682                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9683                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9684
9685                 /* Update shared-memory copy of checkpoint XID/epoch */
9686                 SpinLockAcquire(&XLogCtl->info_lck);
9687                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9688                 XLogCtl->ckptXid = checkPoint.nextXid;
9689                 SpinLockRelease(&XLogCtl->info_lck);
9690
9691                 /*
9692                  * We should've already switched to the new TLI before replaying this
9693                  * record.
9694                  */
9695                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9696                         ereport(PANIC,
9697                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9698                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9699
9700                 RecoveryRestartPoint(&checkPoint);
9701         }
9702         else if (info == XLOG_CHECKPOINT_ONLINE)
9703         {
9704                 CheckPoint      checkPoint;
9705
9706                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9707                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9708                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9709                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9710                                                                   checkPoint.nextXid))
9711                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9712                 LWLockRelease(XidGenLock);
9713                 /* ... but still treat OID counter as exact */
9714                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9715                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9716                 ShmemVariableCache->oidCount = 0;
9717                 LWLockRelease(OidGenLock);
9718                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9719                                                                   checkPoint.nextMultiOffset);
9720
9721                 /*
9722                  * NB: This may perform multixact truncation when replaying WAL
9723                  * generated by an older primary.
9724                  */
9725                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9726                                                            checkPoint.oldestMultiDB);
9727                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9728                                                                   checkPoint.oldestXid))
9729                         SetTransactionIdLimit(checkPoint.oldestXid,
9730                                                                   checkPoint.oldestXidDB);
9731                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9732                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9733                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9734
9735                 /* Update shared-memory copy of checkpoint XID/epoch */
9736                 SpinLockAcquire(&XLogCtl->info_lck);
9737                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9738                 XLogCtl->ckptXid = checkPoint.nextXid;
9739                 SpinLockRelease(&XLogCtl->info_lck);
9740
9741                 /* TLI should not change in an on-line checkpoint */
9742                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9743                         ereport(PANIC,
9744                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9745                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9746
9747                 RecoveryRestartPoint(&checkPoint);
9748         }
9749         else if (info == XLOG_END_OF_RECOVERY)
9750         {
9751                 xl_end_of_recovery xlrec;
9752
9753                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9754
9755                 /*
9756                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9757                  * but this case is rarer and harder to test, so the benefit doesn't
9758                  * outweigh the potential extra cost of maintenance.
9759                  */
9760
9761                 /*
9762                  * We should've already switched to the new TLI before replaying this
9763                  * record.
9764                  */
9765                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9766                         ereport(PANIC,
9767                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9768                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9769         }
9770         else if (info == XLOG_NOOP)
9771         {
9772                 /* nothing to do here */
9773         }
9774         else if (info == XLOG_SWITCH)
9775         {
9776                 /* nothing to do here */
9777         }
9778         else if (info == XLOG_RESTORE_POINT)
9779         {
9780                 /* nothing to do here */
9781         }
9782         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
9783         {
9784                 Buffer          buffer;
9785
9786                 /*
9787                  * Full-page image (FPI) records contain nothing else but a backup
9788                  * block. The block reference must include a full-page image -
9789                  * otherwise there would be no point in this record.
9790                  *
9791                  * No recovery conflicts are generated by these generic records - if a
9792                  * resource manager needs to generate conflicts, it has to define a
9793                  * separate WAL record type and redo routine.
9794                  *
9795                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
9796                  * WAL- logged because of a hint bit update. They are only generated
9797                  * when checksums are enabled. There is no difference in handling
9798                  * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
9799                  * code just to distinguish them for statistics purposes.
9800                  */
9801                 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
9802                         elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
9803                 UnlockReleaseBuffer(buffer);
9804         }
9805         else if (info == XLOG_BACKUP_END)
9806         {
9807                 XLogRecPtr      startpoint;
9808
9809                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9810
9811                 if (ControlFile->backupStartPoint == startpoint)
9812                 {
9813                         /*
9814                          * We have reached the end of base backup, the point where
9815                          * pg_stop_backup() was done. The data on disk is now consistent.
9816                          * Reset backupStartPoint, and update minRecoveryPoint to make
9817                          * sure we don't allow starting up at an earlier point even if
9818                          * recovery is stopped and restarted soon after this.
9819                          */
9820                         elog(DEBUG1, "end of backup reached");
9821
9822                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9823
9824                         if (ControlFile->minRecoveryPoint < lsn)
9825                         {
9826                                 ControlFile->minRecoveryPoint = lsn;
9827                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9828                         }
9829                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9830                         ControlFile->backupEndRequired = false;
9831                         UpdateControlFile();
9832
9833                         LWLockRelease(ControlFileLock);
9834                 }
9835         }
9836         else if (info == XLOG_PARAMETER_CHANGE)
9837         {
9838                 xl_parameter_change xlrec;
9839
9840                 /* Update our copy of the parameters in pg_control */
9841                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9842
9843                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9844                 ControlFile->MaxConnections = xlrec.MaxConnections;
9845                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9846                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9847                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9848                 ControlFile->wal_level = xlrec.wal_level;
9849                 ControlFile->wal_log_hints = xlrec.wal_log_hints;
9850
9851                 /*
9852                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9853                  * recover back up to this point before allowing hot standby again.
9854                  * This is important if the max_* settings are decreased, to ensure
9855                  * you don't run queries against the WAL preceding the change.
9856                  */
9857                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9858                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9859                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9860                 {
9861                         ControlFile->minRecoveryPoint = lsn;
9862                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9863                 }
9864
9865                 CommitTsParameterChange(xlrec.track_commit_timestamp,
9866                                                                 ControlFile->track_commit_timestamp);
9867                 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
9868
9869                 UpdateControlFile();
9870                 LWLockRelease(ControlFileLock);
9871
9872                 /* Check to see if any changes to max_connections give problems */
9873                 CheckRequiredParameterValues();
9874         }
9875         else if (info == XLOG_FPW_CHANGE)
9876         {
9877                 bool            fpw;
9878
9879                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9880
9881                 /*
9882                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9883                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9884                  * full_page_writes has been disabled during online backup.
9885                  */
9886                 if (!fpw)
9887                 {
9888                         SpinLockAcquire(&XLogCtl->info_lck);
9889                         if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
9890                                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
9891                         SpinLockRelease(&XLogCtl->info_lck);
9892                 }
9893
9894                 /* Keep track of full_page_writes */
9895                 lastFullPageWrites = fpw;
9896         }
9897 }
9898
9899 #ifdef WAL_DEBUG
9900
9901 static void
9902 xlog_outrec(StringInfo buf, XLogReaderState *record)
9903 {
9904         int                     block_id;
9905
9906         appendStringInfo(buf, "prev %X/%X; xid %u",
9907                                          (uint32) (XLogRecGetPrev(record) >> 32),
9908                                          (uint32) XLogRecGetPrev(record),
9909                                          XLogRecGetXid(record));
9910
9911         appendStringInfo(buf, "; len %u",
9912                                          XLogRecGetDataLen(record));
9913
9914         /* decode block references */
9915         for (block_id = 0; block_id <= record->max_block_id; block_id++)
9916         {
9917                 RelFileNode rnode;
9918                 ForkNumber      forknum;
9919                 BlockNumber blk;
9920
9921                 if (!XLogRecHasBlockRef(record, block_id))
9922                         continue;
9923
9924                 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
9925                 if (forknum != MAIN_FORKNUM)
9926                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
9927                                                          block_id,
9928                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
9929                                                          forknum,
9930                                                          blk);
9931                 else
9932                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
9933                                                          block_id,
9934                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
9935                                                          blk);
9936                 if (XLogRecHasBlockImage(record, block_id))
9937                         appendStringInfoString(buf, " FPW");
9938         }
9939 }
9940 #endif   /* WAL_DEBUG */
9941
9942 /*
9943  * Returns a string describing an XLogRecord, consisting of its identity
9944  * optionally followed by a colon, a space, and a further description.
9945  */
9946 static void
9947 xlog_outdesc(StringInfo buf, XLogReaderState *record)
9948 {
9949         RmgrId          rmid = XLogRecGetRmid(record);
9950         uint8           info = XLogRecGetInfo(record);
9951         const char *id;
9952
9953         appendStringInfoString(buf, RmgrTable[rmid].rm_name);
9954         appendStringInfoChar(buf, '/');
9955
9956         id = RmgrTable[rmid].rm_identify(info);
9957         if (id == NULL)
9958                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
9959         else
9960                 appendStringInfo(buf, "%s: ", id);
9961
9962         RmgrTable[rmid].rm_desc(buf, record);
9963 }
9964
9965
9966 /*
9967  * Return the (possible) sync flag used for opening a file, depending on the
9968  * value of the GUC wal_sync_method.
9969  */
9970 static int
9971 get_sync_bit(int method)
9972 {
9973         int                     o_direct_flag = 0;
9974
9975         /* If fsync is disabled, never open in sync mode */
9976         if (!enableFsync)
9977                 return 0;
9978
9979         /*
9980          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9981          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9982          * disabled, otherwise the archive command or walsender process will read
9983          * the WAL soon after writing it, which is guaranteed to cause a physical
9984          * read if we bypassed the kernel cache. We also skip the
9985          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9986          * reason.
9987          *
9988          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9989          * written by walreceiver is normally read by the startup process soon
9990          * after its written. Also, walreceiver performs unaligned writes, which
9991          * don't work with O_DIRECT, so it is required for correctness too.
9992          */
9993         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9994                 o_direct_flag = PG_O_DIRECT;
9995
9996         switch (method)
9997         {
9998                         /*
9999                          * enum values for all sync options are defined even if they are
10000                          * not supported on the current platform.  But if not, they are
10001                          * not included in the enum option array, and therefore will never
10002                          * be seen here.
10003                          */
10004                 case SYNC_METHOD_FSYNC:
10005                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10006                 case SYNC_METHOD_FDATASYNC:
10007                         return 0;
10008 #ifdef OPEN_SYNC_FLAG
10009                 case SYNC_METHOD_OPEN:
10010                         return OPEN_SYNC_FLAG | o_direct_flag;
10011 #endif
10012 #ifdef OPEN_DATASYNC_FLAG
10013                 case SYNC_METHOD_OPEN_DSYNC:
10014                         return OPEN_DATASYNC_FLAG | o_direct_flag;
10015 #endif
10016                 default:
10017                         /* can't happen (unless we are out of sync with option array) */
10018                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
10019                         return 0;                       /* silence warning */
10020         }
10021 }
10022
10023 /*
10024  * GUC support
10025  */
10026 void
10027 assign_xlog_sync_method(int new_sync_method, void *extra)
10028 {
10029         if (sync_method != new_sync_method)
10030         {
10031                 /*
10032                  * To ensure that no blocks escape unsynced, force an fsync on the
10033                  * currently open log segment (if any).  Also, if the open flag is
10034                  * changing, close the log file so it will be reopened (with new flag
10035                  * bit) at next use.
10036                  */
10037                 if (openLogFile >= 0)
10038                 {
10039                         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10040                         if (pg_fsync(openLogFile) != 0)
10041                                 ereport(PANIC,
10042                                                 (errcode_for_file_access(),
10043                                                  errmsg("could not fsync log segment %s: %m",
10044                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10045                         pgstat_report_wait_end();
10046                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10047                                 XLogFileClose();
10048                 }
10049         }
10050 }
10051
10052
10053 /*
10054  * Issue appropriate kind of fsync (if any) for an XLOG output file.
10055  *
10056  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10057  * 'log' and 'seg' are for error reporting purposes.
10058  */
10059 void
10060 issue_xlog_fsync(int fd, XLogSegNo segno)
10061 {
10062         switch (sync_method)
10063         {
10064                 case SYNC_METHOD_FSYNC:
10065                         if (pg_fsync_no_writethrough(fd) != 0)
10066                                 ereport(PANIC,
10067                                                 (errcode_for_file_access(),
10068                                                  errmsg("could not fsync log file %s: %m",
10069                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10070                         break;
10071 #ifdef HAVE_FSYNC_WRITETHROUGH
10072                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10073                         if (pg_fsync_writethrough(fd) != 0)
10074                                 ereport(PANIC,
10075                                                 (errcode_for_file_access(),
10076                                           errmsg("could not fsync write-through log file %s: %m",
10077                                                          XLogFileNameP(ThisTimeLineID, segno))));
10078                         break;
10079 #endif
10080 #ifdef HAVE_FDATASYNC
10081                 case SYNC_METHOD_FDATASYNC:
10082                         if (pg_fdatasync(fd) != 0)
10083                                 ereport(PANIC,
10084                                                 (errcode_for_file_access(),
10085                                                  errmsg("could not fdatasync log file %s: %m",
10086                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10087                         break;
10088 #endif
10089                 case SYNC_METHOD_OPEN:
10090                 case SYNC_METHOD_OPEN_DSYNC:
10091                         /* write synced it already */
10092                         break;
10093                 default:
10094                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10095                         break;
10096         }
10097 }
10098
10099 /*
10100  * Return the filename of given log segment, as a palloc'd string.
10101  */
10102 char *
10103 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10104 {
10105         char       *result = palloc(MAXFNAMELEN);
10106
10107         XLogFileName(result, tli, segno);
10108         return result;
10109 }
10110
10111 /*
10112  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
10113  * function. It creates the necessary starting checkpoint and constructs the
10114  * backup label file.
10115  *
10116  * There are two kind of backups: exclusive and non-exclusive. An exclusive
10117  * backup is started with pg_start_backup(), and there can be only one active
10118  * at a time. The backup and tablespace map files of an exclusive backup are
10119  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10120  * removed by pg_stop_backup().
10121  *
10122  * A non-exclusive backup is used for the streaming base backups (see
10123  * src/backend/replication/basebackup.c). The difference to exclusive backups
10124  * is that the backup label and tablespace map files are not written to disk.
10125  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10126  * and the caller is responsible for including them in the backup archive as
10127  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10128  * active at the same time, and they don't conflict with an exclusive backup
10129  * either.
10130  *
10131  * tblspcmapfile is required mainly for tar format in windows as native windows
10132  * utilities are not able to create symlinks while extracting files from tar.
10133  * However for consistency, the same is used for all platforms.
10134  *
10135  * needtblspcmapfile is true for the cases (exclusive backup and for
10136  * non-exclusive backup only when tar format is used for taking backup)
10137  * when backup needs to generate tablespace_map file, it is used to
10138  * embed escape character before newline character in tablespace path.
10139  *
10140  * Returns the minimum WAL position that must be present to restore from this
10141  * backup, and the corresponding timeline ID in *starttli_p.
10142  *
10143  * Every successfully started non-exclusive backup must be stopped by calling
10144  * do_pg_stop_backup() or do_pg_abort_backup().
10145  *
10146  * It is the responsibility of the caller of this function to verify the
10147  * permissions of the calling user!
10148  */
10149 XLogRecPtr
10150 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10151                                    StringInfo labelfile, DIR *tblspcdir, List **tablespaces,
10152                                    StringInfo tblspcmapfile, bool infotbssize,
10153                                    bool needtblspcmapfile)
10154 {
10155         bool            exclusive = (labelfile == NULL);
10156         bool            backup_started_in_recovery = false;
10157         XLogRecPtr      checkpointloc;
10158         XLogRecPtr      startpoint;
10159         TimeLineID      starttli;
10160         pg_time_t       stamp_time;
10161         char            strfbuf[128];
10162         char            xlogfilename[MAXFNAMELEN];
10163         XLogSegNo       _logSegNo;
10164         struct stat stat_buf;
10165         FILE       *fp;
10166
10167         backup_started_in_recovery = RecoveryInProgress();
10168
10169         /*
10170          * Currently only non-exclusive backup can be taken during recovery.
10171          */
10172         if (backup_started_in_recovery && exclusive)
10173                 ereport(ERROR,
10174                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10175                                  errmsg("recovery is in progress"),
10176                                  errhint("WAL control functions cannot be executed during recovery.")));
10177
10178         /*
10179          * During recovery, we don't need to check WAL level. Because, if WAL
10180          * level is not sufficient, it's impossible to get here during recovery.
10181          */
10182         if (!backup_started_in_recovery && !XLogIsNeeded())
10183                 ereport(ERROR,
10184                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10185                           errmsg("WAL level not sufficient for making an online backup"),
10186                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10187
10188         if (strlen(backupidstr) > MAXPGPATH)
10189                 ereport(ERROR,
10190                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10191                                  errmsg("backup label too long (max %d bytes)",
10192                                                 MAXPGPATH)));
10193
10194         /*
10195          * Mark backup active in shared memory.  We must do full-page WAL writes
10196          * during an on-line backup even if not doing so at other times, because
10197          * it's quite possible for the backup dump to obtain a "torn" (partially
10198          * written) copy of a database page if it reads the page concurrently with
10199          * our write to the same page.  This can be fixed as long as the first
10200          * write to the page in the WAL sequence is a full-page write. Hence, we
10201          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10202          * are no dirty pages in shared memory that might get dumped while the
10203          * backup is in progress without having a corresponding WAL record.  (Once
10204          * the backup is complete, we need not force full-page writes anymore,
10205          * since we expect that any pages not modified during the backup interval
10206          * must have been correctly captured by the backup.)
10207          *
10208          * Note that forcePageWrites has no effect during an online backup from
10209          * the standby.
10210          *
10211          * We must hold all the insertion locks to change the value of
10212          * forcePageWrites, to ensure adequate interlocking against
10213          * XLogInsertRecord().
10214          */
10215         WALInsertLockAcquireExclusive();
10216         if (exclusive)
10217         {
10218                 /*
10219                  * At first, mark that we're now starting an exclusive backup,
10220                  * to ensure that there are no other sessions currently running
10221                  * pg_start_backup() or pg_stop_backup().
10222                  */
10223                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10224                 {
10225                         WALInsertLockRelease();
10226                         ereport(ERROR,
10227                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10228                                          errmsg("a backup is already in progress"),
10229                                          errhint("Run pg_stop_backup() and try again.")));
10230                 }
10231                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10232         }
10233         else
10234                 XLogCtl->Insert.nonExclusiveBackups++;
10235         XLogCtl->Insert.forcePageWrites = true;
10236         WALInsertLockRelease();
10237
10238         /* Ensure we release forcePageWrites if fail below */
10239         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10240         {
10241                 bool            gotUniqueStartpoint = false;
10242                 struct dirent *de;
10243                 tablespaceinfo *ti;
10244                 int                     datadirpathlen;
10245
10246                 /*
10247                  * Force an XLOG file switch before the checkpoint, to ensure that the
10248                  * WAL segment the checkpoint is written to doesn't contain pages with
10249                  * old timeline IDs.  That would otherwise happen if you called
10250                  * pg_start_backup() right after restoring from a PITR archive: the
10251                  * first WAL segment containing the startup checkpoint has pages in
10252                  * the beginning with the old timeline ID.  That can cause trouble at
10253                  * recovery: we won't have a history file covering the old timeline if
10254                  * pg_wal directory was not included in the base backup and the WAL
10255                  * archive was cleared too before starting the backup.
10256                  *
10257                  * This also ensures that we have emitted a WAL page header that has
10258                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10259                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
10260                  * compress out removable backup blocks, it won't remove any that
10261                  * occur after this point.
10262                  *
10263                  * During recovery, we skip forcing XLOG file switch, which means that
10264                  * the backup taken during recovery is not available for the special
10265                  * recovery case described above.
10266                  */
10267                 if (!backup_started_in_recovery)
10268                         RequestXLogSwitch(false);
10269
10270                 do
10271                 {
10272                         bool            checkpointfpw;
10273
10274                         /*
10275                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
10276                          * page problems, this guarantees that two successive backup runs
10277                          * will have different checkpoint positions and hence different
10278                          * history file names, even if nothing happened in between.
10279                          *
10280                          * During recovery, establish a restartpoint if possible. We use
10281                          * the last restartpoint as the backup starting checkpoint. This
10282                          * means that two successive backup runs can have same checkpoint
10283                          * positions.
10284                          *
10285                          * Since the fact that we are executing do_pg_start_backup()
10286                          * during recovery means that checkpointer is running, we can use
10287                          * RequestCheckpoint() to establish a restartpoint.
10288                          *
10289                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10290                          * passing fast = true).  Otherwise this can take awhile.
10291                          */
10292                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10293                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
10294
10295                         /*
10296                          * Now we need to fetch the checkpoint record location, and also
10297                          * its REDO pointer.  The oldest point in WAL that would be needed
10298                          * to restore starting from the checkpoint is precisely the REDO
10299                          * pointer.
10300                          */
10301                         LWLockAcquire(ControlFileLock, LW_SHARED);
10302                         checkpointloc = ControlFile->checkPoint;
10303                         startpoint = ControlFile->checkPointCopy.redo;
10304                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10305                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10306                         LWLockRelease(ControlFileLock);
10307
10308                         if (backup_started_in_recovery)
10309                         {
10310                                 XLogRecPtr      recptr;
10311
10312                                 /*
10313                                  * Check to see if all WAL replayed during online backup
10314                                  * (i.e., since last restartpoint used as backup starting
10315                                  * checkpoint) contain full-page writes.
10316                                  */
10317                                 SpinLockAcquire(&XLogCtl->info_lck);
10318                                 recptr = XLogCtl->lastFpwDisableRecPtr;
10319                                 SpinLockRelease(&XLogCtl->info_lck);
10320
10321                                 if (!checkpointfpw || startpoint <= recptr)
10322                                         ereport(ERROR,
10323                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10324                                                    errmsg("WAL generated with full_page_writes=off was replayed "
10325                                                                   "since last restartpoint"),
10326                                                    errhint("This means that the backup being taken on the standby "
10327                                                                    "is corrupt and should not be used. "
10328                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
10329                                                                    "and then try an online backup again.")));
10330
10331                                 /*
10332                                  * During recovery, since we don't use the end-of-backup WAL
10333                                  * record and don't write the backup history file, the
10334                                  * starting WAL location doesn't need to be unique. This means
10335                                  * that two base backups started at the same time might use
10336                                  * the same checkpoint as starting locations.
10337                                  */
10338                                 gotUniqueStartpoint = true;
10339                         }
10340
10341                         /*
10342                          * If two base backups are started at the same time (in WAL sender
10343                          * processes), we need to make sure that they use different
10344                          * checkpoints as starting locations, because we use the starting
10345                          * WAL location as a unique identifier for the base backup in the
10346                          * end-of-backup WAL record and when we write the backup history
10347                          * file. Perhaps it would be better generate a separate unique ID
10348                          * for each backup instead of forcing another checkpoint, but
10349                          * taking a checkpoint right after another is not that expensive
10350                          * either because only few buffers have been dirtied yet.
10351                          */
10352                         WALInsertLockAcquireExclusive();
10353                         if (XLogCtl->Insert.lastBackupStart < startpoint)
10354                         {
10355                                 XLogCtl->Insert.lastBackupStart = startpoint;
10356                                 gotUniqueStartpoint = true;
10357                         }
10358                         WALInsertLockRelease();
10359                 } while (!gotUniqueStartpoint);
10360
10361                 XLByteToSeg(startpoint, _logSegNo);
10362                 XLogFileName(xlogfilename, starttli, _logSegNo);
10363
10364                 /*
10365                  * Construct tablespace_map file
10366                  */
10367                 if (exclusive)
10368                         tblspcmapfile = makeStringInfo();
10369
10370                 datadirpathlen = strlen(DataDir);
10371
10372                 /* Collect information about all tablespaces */
10373                 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10374                 {
10375                         char            fullpath[MAXPGPATH];
10376                         char            linkpath[MAXPGPATH];
10377                         char       *relpath = NULL;
10378                         int                     rllen;
10379                         StringInfoData buflinkpath;
10380                         char       *s = linkpath;
10381
10382                         /* Skip special stuff */
10383                         if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10384                                 continue;
10385
10386                         snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10387
10388 #if defined(HAVE_READLINK) || defined(WIN32)
10389                         rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10390                         if (rllen < 0)
10391                         {
10392                                 ereport(WARNING,
10393                                                 (errmsg("could not read symbolic link \"%s\": %m",
10394                                                                 fullpath)));
10395                                 continue;
10396                         }
10397                         else if (rllen >= sizeof(linkpath))
10398                         {
10399                                 ereport(WARNING,
10400                                                 (errmsg("symbolic link \"%s\" target is too long",
10401                                                                 fullpath)));
10402                                 continue;
10403                         }
10404                         linkpath[rllen] = '\0';
10405
10406                         /*
10407                          * Add the escape character '\\' before newline in a string to
10408                          * ensure that we can distinguish between the newline in the
10409                          * tablespace path and end of line while reading tablespace_map
10410                          * file during archive recovery.
10411                          */
10412                         initStringInfo(&buflinkpath);
10413
10414                         while (*s)
10415                         {
10416                                 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10417                                         appendStringInfoChar(&buflinkpath, '\\');
10418                                 appendStringInfoChar(&buflinkpath, *s++);
10419                         }
10420
10421
10422                         /*
10423                          * Relpath holds the relative path of the tablespace directory
10424                          * when it's located within PGDATA, or NULL if it's located
10425                          * elsewhere.
10426                          */
10427                         if (rllen > datadirpathlen &&
10428                                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10429                                 IS_DIR_SEP(linkpath[datadirpathlen]))
10430                                 relpath = linkpath + datadirpathlen + 1;
10431
10432                         ti = palloc(sizeof(tablespaceinfo));
10433                         ti->oid = pstrdup(de->d_name);
10434                         ti->path = pstrdup(buflinkpath.data);
10435                         ti->rpath = relpath ? pstrdup(relpath) : NULL;
10436                         ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10437
10438                         if (tablespaces)
10439                                 *tablespaces = lappend(*tablespaces, ti);
10440
10441                         appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10442
10443                         pfree(buflinkpath.data);
10444 #else
10445
10446                         /*
10447                          * If the platform does not have symbolic links, it should not be
10448                          * possible to have tablespaces - clearly somebody else created
10449                          * them. Warn about it and ignore.
10450                          */
10451                         ereport(WARNING,
10452                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10453                                   errmsg("tablespaces are not supported on this platform")));
10454 #endif
10455                 }
10456
10457                 /*
10458                  * Construct backup label file
10459                  */
10460                 if (exclusive)
10461                         labelfile = makeStringInfo();
10462
10463                 /* Use the log timezone here, not the session timezone */
10464                 stamp_time = (pg_time_t) time(NULL);
10465                 pg_strftime(strfbuf, sizeof(strfbuf),
10466                                         "%Y-%m-%d %H:%M:%S %Z",
10467                                         pg_localtime(&stamp_time, log_timezone));
10468                 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10469                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10470                 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10471                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10472                 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10473                                                  exclusive ? "pg_start_backup" : "streamed");
10474                 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10475                                                  backup_started_in_recovery ? "standby" : "master");
10476                 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10477                 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10478
10479                 /*
10480                  * Okay, write the file, or return its contents to caller.
10481                  */
10482                 if (exclusive)
10483                 {
10484                         /*
10485                          * Check for existing backup label --- implies a backup is already
10486                          * running.  (XXX given that we checked exclusiveBackupState above,
10487                          * maybe it would be OK to just unlink any such label file?)
10488                          */
10489                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10490                         {
10491                                 if (errno != ENOENT)
10492                                         ereport(ERROR,
10493                                                         (errcode_for_file_access(),
10494                                                          errmsg("could not stat file \"%s\": %m",
10495                                                                         BACKUP_LABEL_FILE)));
10496                         }
10497                         else
10498                                 ereport(ERROR,
10499                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10500                                                  errmsg("a backup is already in progress"),
10501                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10502                                                                  BACKUP_LABEL_FILE)));
10503
10504                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10505
10506                         if (!fp)
10507                                 ereport(ERROR,
10508                                                 (errcode_for_file_access(),
10509                                                  errmsg("could not create file \"%s\": %m",
10510                                                                 BACKUP_LABEL_FILE)));
10511                         if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10512                                 fflush(fp) != 0 ||
10513                                 pg_fsync(fileno(fp)) != 0 ||
10514                                 ferror(fp) ||
10515                                 FreeFile(fp))
10516                                 ereport(ERROR,
10517                                                 (errcode_for_file_access(),
10518                                                  errmsg("could not write file \"%s\": %m",
10519                                                                 BACKUP_LABEL_FILE)));
10520                         /* Allocated locally for exclusive backups, so free separately */
10521                         pfree(labelfile->data);
10522                         pfree(labelfile);
10523
10524                         /* Write backup tablespace_map file. */
10525                         if (tblspcmapfile->len > 0)
10526                         {
10527                                 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10528                                 {
10529                                         if (errno != ENOENT)
10530                                                 ereport(ERROR,
10531                                                                 (errcode_for_file_access(),
10532                                                                  errmsg("could not stat file \"%s\": %m",
10533                                                                                 TABLESPACE_MAP)));
10534                                 }
10535                                 else
10536                                         ereport(ERROR,
10537                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10538                                                    errmsg("a backup is already in progress"),
10539                                                    errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10540                                                                    TABLESPACE_MAP)));
10541
10542                                 fp = AllocateFile(TABLESPACE_MAP, "w");
10543
10544                                 if (!fp)
10545                                         ereport(ERROR,
10546                                                         (errcode_for_file_access(),
10547                                                          errmsg("could not create file \"%s\": %m",
10548                                                                         TABLESPACE_MAP)));
10549                                 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10550                                         fflush(fp) != 0 ||
10551                                         pg_fsync(fileno(fp)) != 0 ||
10552                                         ferror(fp) ||
10553                                         FreeFile(fp))
10554                                         ereport(ERROR,
10555                                                         (errcode_for_file_access(),
10556                                                          errmsg("could not write file \"%s\": %m",
10557                                                                         TABLESPACE_MAP)));
10558                         }
10559
10560                         /* Allocated locally for exclusive backups, so free separately */
10561                         pfree(tblspcmapfile->data);
10562                         pfree(tblspcmapfile);
10563                 }
10564         }
10565         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10566
10567         /*
10568          * Mark that start phase has correctly finished for an exclusive backup.
10569          */
10570         if (exclusive)
10571         {
10572                 WALInsertLockAcquireExclusive();
10573                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10574                 WALInsertLockRelease();
10575         }
10576
10577         /*
10578          * We're done.  As a convenience, return the starting WAL location.
10579          */
10580         if (starttli_p)
10581                 *starttli_p = starttli;
10582         return startpoint;
10583 }
10584
10585 /* Error cleanup callback for pg_start_backup */
10586 static void
10587 pg_start_backup_callback(int code, Datum arg)
10588 {
10589         bool            exclusive = DatumGetBool(arg);
10590
10591         /* Update backup counters and forcePageWrites on failure */
10592         WALInsertLockAcquireExclusive();
10593         if (exclusive)
10594         {
10595                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10596                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10597         }
10598         else
10599         {
10600                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10601                 XLogCtl->Insert.nonExclusiveBackups--;
10602         }
10603
10604         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10605                 XLogCtl->Insert.nonExclusiveBackups == 0)
10606         {
10607                 XLogCtl->Insert.forcePageWrites = false;
10608         }
10609         WALInsertLockRelease();
10610 }
10611
10612 /*
10613  * Error cleanup callback for pg_stop_backup
10614  */
10615 static void
10616 pg_stop_backup_callback(int code, Datum arg)
10617 {
10618         bool            exclusive = DatumGetBool(arg);
10619
10620         /* Update backup status on failure */
10621         WALInsertLockAcquireExclusive();
10622         if (exclusive)
10623         {
10624                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10625                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10626         }
10627         WALInsertLockRelease();
10628 }
10629
10630 /*
10631  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10632  * function.
10633  *
10634  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10635  * the non-exclusive backup specified by 'labelfile'.
10636  *
10637  * Returns the last WAL position that must be present to restore from this
10638  * backup, and the corresponding timeline ID in *stoptli_p.
10639  *
10640  * It is the responsibility of the caller of this function to verify the
10641  * permissions of the calling user!
10642  */
10643 XLogRecPtr
10644 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10645 {
10646         bool            exclusive = (labelfile == NULL);
10647         bool            backup_started_in_recovery = false;
10648         XLogRecPtr      startpoint;
10649         XLogRecPtr      stoppoint;
10650         TimeLineID      stoptli;
10651         pg_time_t       stamp_time;
10652         char            strfbuf[128];
10653         char            histfilepath[MAXPGPATH];
10654         char            startxlogfilename[MAXFNAMELEN];
10655         char            stopxlogfilename[MAXFNAMELEN];
10656         char            lastxlogfilename[MAXFNAMELEN];
10657         char            histfilename[MAXFNAMELEN];
10658         char            backupfrom[20];
10659         XLogSegNo       _logSegNo;
10660         FILE       *lfp;
10661         FILE       *fp;
10662         char            ch;
10663         int                     seconds_before_warning;
10664         int                     waits = 0;
10665         bool            reported_waiting = false;
10666         char       *remaining;
10667         char       *ptr;
10668         uint32          hi,
10669                                 lo;
10670
10671         backup_started_in_recovery = RecoveryInProgress();
10672
10673         /*
10674          * Currently only non-exclusive backup can be taken during recovery.
10675          */
10676         if (backup_started_in_recovery && exclusive)
10677                 ereport(ERROR,
10678                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10679                                  errmsg("recovery is in progress"),
10680                                  errhint("WAL control functions cannot be executed during recovery.")));
10681
10682         /*
10683          * During recovery, we don't need to check WAL level. Because, if WAL
10684          * level is not sufficient, it's impossible to get here during recovery.
10685          */
10686         if (!backup_started_in_recovery && !XLogIsNeeded())
10687                 ereport(ERROR,
10688                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10689                           errmsg("WAL level not sufficient for making an online backup"),
10690                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10691
10692         if (exclusive)
10693         {
10694                 /*
10695                  * At first, mark that we're now stopping an exclusive backup,
10696                  * to ensure that there are no other sessions currently running
10697                  * pg_start_backup() or pg_stop_backup().
10698                  */
10699                 WALInsertLockAcquireExclusive();
10700                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10701                 {
10702                         WALInsertLockRelease();
10703                         ereport(ERROR,
10704                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10705                                          errmsg("exclusive backup not in progress")));
10706                 }
10707                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10708                 WALInsertLockRelease();
10709
10710                 /*
10711                  * Remove backup_label. In case of failure, the state for an exclusive
10712                  * backup is switched back to in-progress.
10713                  */
10714                 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10715                 {
10716                         /*
10717                          * Read the existing label file into memory.
10718                          */
10719                         struct stat statbuf;
10720                         int                     r;
10721
10722                         if (stat(BACKUP_LABEL_FILE, &statbuf))
10723                         {
10724                                 /* should not happen per the upper checks */
10725                                 if (errno != ENOENT)
10726                                         ereport(ERROR,
10727                                                         (errcode_for_file_access(),
10728                                                          errmsg("could not stat file \"%s\": %m",
10729                                                                         BACKUP_LABEL_FILE)));
10730                                 ereport(ERROR,
10731                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10732                                                  errmsg("a backup is not in progress")));
10733                         }
10734
10735                         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10736                         if (!lfp)
10737                         {
10738                                 ereport(ERROR,
10739                                                 (errcode_for_file_access(),
10740                                                  errmsg("could not read file \"%s\": %m",
10741                                                                 BACKUP_LABEL_FILE)));
10742                         }
10743                         labelfile = palloc(statbuf.st_size + 1);
10744                         r = fread(labelfile, statbuf.st_size, 1, lfp);
10745                         labelfile[statbuf.st_size] = '\0';
10746
10747                         /*
10748                          * Close and remove the backup label file
10749                          */
10750                         if (r != 1 || ferror(lfp) || FreeFile(lfp))
10751                                 ereport(ERROR,
10752                                                 (errcode_for_file_access(),
10753                                                  errmsg("could not read file \"%s\": %m",
10754                                                                 BACKUP_LABEL_FILE)));
10755                         if (unlink(BACKUP_LABEL_FILE) != 0)
10756                                 ereport(ERROR,
10757                                                 (errcode_for_file_access(),
10758                                                  errmsg("could not remove file \"%s\": %m",
10759                                                                 BACKUP_LABEL_FILE)));
10760
10761                         /*
10762                          * Remove tablespace_map file if present, it is created only if there
10763                          * are tablespaces.
10764                          */
10765                         unlink(TABLESPACE_MAP);
10766                 }
10767                 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10768         }
10769
10770         /*
10771          * OK to update backup counters and forcePageWrites
10772          */
10773         WALInsertLockAcquireExclusive();
10774         if (exclusive)
10775         {
10776                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10777         }
10778         else
10779         {
10780                 /*
10781                  * The user-visible pg_start/stop_backup() functions that operate on
10782                  * exclusive backups can be called at any time, but for non-exclusive
10783                  * backups, it is expected that each do_pg_start_backup() call is
10784                  * matched by exactly one do_pg_stop_backup() call.
10785                  */
10786                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10787                 XLogCtl->Insert.nonExclusiveBackups--;
10788         }
10789
10790         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10791                 XLogCtl->Insert.nonExclusiveBackups == 0)
10792         {
10793                 XLogCtl->Insert.forcePageWrites = false;
10794         }
10795         WALInsertLockRelease();
10796
10797         /*
10798          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10799          * but we are not expecting any variability in the file format).
10800          */
10801         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10802                            &hi, &lo, startxlogfilename,
10803                            &ch) != 4 || ch != '\n')
10804                 ereport(ERROR,
10805                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10806                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10807         startpoint = ((uint64) hi) << 32 | lo;
10808         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10809
10810         /*
10811          * Parse the BACKUP FROM line. If we are taking an online backup from the
10812          * standby, we confirm that the standby has not been promoted during the
10813          * backup.
10814          */
10815         ptr = strstr(remaining, "BACKUP FROM:");
10816         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10817                 ereport(ERROR,
10818                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10819                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10820         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10821                 ereport(ERROR,
10822                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10823                                  errmsg("the standby was promoted during online backup"),
10824                                  errhint("This means that the backup being taken is corrupt "
10825                                                  "and should not be used. "
10826                                                  "Try taking another online backup.")));
10827
10828         /*
10829          * During recovery, we don't write an end-of-backup record. We assume that
10830          * pg_control was backed up last and its minimum recovery point can be
10831          * available as the backup end location. Since we don't have an
10832          * end-of-backup record, we use the pg_control value to check whether
10833          * we've reached the end of backup when starting recovery from this
10834          * backup. We have no way of checking if pg_control wasn't backed up last
10835          * however.
10836          *
10837          * We don't force a switch to new WAL file and wait for all the required
10838          * files to be archived. This is okay if we use the backup to start the
10839          * standby. But, if it's for an archive recovery, to ensure all the
10840          * required files are available, a user should wait for them to be
10841          * archived, or include them into the backup.
10842          *
10843          * We return the current minimum recovery point as the backup end
10844          * location. Note that it can be greater than the exact backup end
10845          * location if the minimum recovery point is updated after the backup of
10846          * pg_control. This is harmless for current uses.
10847          *
10848          * XXX currently a backup history file is for informational and debug
10849          * purposes only. It's not essential for an online backup. Furthermore,
10850          * even if it's created, it will not be archived during recovery because
10851          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10852          * backup history file during recovery.
10853          */
10854         if (backup_started_in_recovery)
10855         {
10856                 XLogRecPtr      recptr;
10857
10858                 /*
10859                  * Check to see if all WAL replayed during online backup contain
10860                  * full-page writes.
10861                  */
10862                 SpinLockAcquire(&XLogCtl->info_lck);
10863                 recptr = XLogCtl->lastFpwDisableRecPtr;
10864                 SpinLockRelease(&XLogCtl->info_lck);
10865
10866                 if (startpoint <= recptr)
10867                         ereport(ERROR,
10868                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10869                            errmsg("WAL generated with full_page_writes=off was replayed "
10870                                           "during online backup"),
10871                          errhint("This means that the backup being taken on the standby "
10872                                          "is corrupt and should not be used. "
10873                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10874                                          "and then try an online backup again.")));
10875
10876
10877                 LWLockAcquire(ControlFileLock, LW_SHARED);
10878                 stoppoint = ControlFile->minRecoveryPoint;
10879                 stoptli = ControlFile->minRecoveryPointTLI;
10880                 LWLockRelease(ControlFileLock);
10881
10882                 if (stoptli_p)
10883                         *stoptli_p = stoptli;
10884                 return stoppoint;
10885         }
10886
10887         /*
10888          * Write the backup-end xlog record
10889          */
10890         XLogBeginInsert();
10891         XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
10892         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
10893         stoptli = ThisTimeLineID;
10894
10895         /*
10896          * Force a switch to a new xlog segment file, so that the backup is valid
10897          * as soon as archiver moves out the current segment file.
10898          */
10899         RequestXLogSwitch(false);
10900
10901         XLByteToPrevSeg(stoppoint, _logSegNo);
10902         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10903
10904         /* Use the log timezone here, not the session timezone */
10905         stamp_time = (pg_time_t) time(NULL);
10906         pg_strftime(strfbuf, sizeof(strfbuf),
10907                                 "%Y-%m-%d %H:%M:%S %Z",
10908                                 pg_localtime(&stamp_time, log_timezone));
10909
10910         /*
10911          * Write the backup history file
10912          */
10913         XLByteToSeg(startpoint, _logSegNo);
10914         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10915                                                   (uint32) (startpoint % XLogSegSize));
10916         fp = AllocateFile(histfilepath, "w");
10917         if (!fp)
10918                 ereport(ERROR,
10919                                 (errcode_for_file_access(),
10920                                  errmsg("could not create file \"%s\": %m",
10921                                                 histfilepath)));
10922         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10923                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10924         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10925                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10926         /* transfer remaining lines from label to history file */
10927         fprintf(fp, "%s", remaining);
10928         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10929         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10930                 ereport(ERROR,
10931                                 (errcode_for_file_access(),
10932                                  errmsg("could not write file \"%s\": %m",
10933                                                 histfilepath)));
10934
10935         /*
10936          * Clean out any no-longer-needed history files.  As a side effect, this
10937          * will post a .ready file for the newly created history file, notifying
10938          * the archiver that history file may be archived immediately.
10939          */
10940         CleanupBackupHistory();
10941
10942         /*
10943          * If archiving is enabled, wait for all the required WAL files to be
10944          * archived before returning. If archiving isn't enabled, the required WAL
10945          * needs to be transported via streaming replication (hopefully with
10946          * wal_keep_segments set high enough), or some more exotic mechanism like
10947          * polling and copying files from pg_wal with script. We have no
10948          * knowledge of those mechanisms, so it's up to the user to ensure that he
10949          * gets all the required WAL.
10950          *
10951          * We wait until both the last WAL file filled during backup and the
10952          * history file have been archived, and assume that the alphabetic sorting
10953          * property of the WAL files ensures any earlier WAL files are safely
10954          * archived as well.
10955          *
10956          * We wait forever, since archive_command is supposed to work and we
10957          * assume the admin wanted his backup to work completely. If you don't
10958          * wish to wait, then either waitforarchive should be passed in as false,
10959          * or you can set statement_timeout.  Also, some notices are
10960          * issued to clue in anyone who might be doing this interactively.
10961          */
10962         if (waitforarchive && XLogArchivingActive())
10963         {
10964                 XLByteToPrevSeg(stoppoint, _logSegNo);
10965                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10966
10967                 XLByteToSeg(startpoint, _logSegNo);
10968                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10969                                                           (uint32) (startpoint % XLogSegSize));
10970
10971                 seconds_before_warning = 60;
10972                 waits = 0;
10973
10974                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10975                            XLogArchiveIsBusy(histfilename))
10976                 {
10977                         CHECK_FOR_INTERRUPTS();
10978
10979                         if (!reported_waiting && waits > 5)
10980                         {
10981                                 ereport(NOTICE,
10982                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10983                                 reported_waiting = true;
10984                         }
10985
10986                         pg_usleep(1000000L);
10987
10988                         if (++waits >= seconds_before_warning)
10989                         {
10990                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10991                                 ereport(WARNING,
10992                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10993                                                                 waits),
10994                                                  errhint("Check that your archive_command is executing properly.  "
10995                                                                  "pg_stop_backup can be canceled safely, "
10996                                                                  "but the database backup will not be usable without all the WAL segments.")));
10997                         }
10998                 }
10999
11000                 ereport(NOTICE,
11001                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
11002         }
11003         else if (waitforarchive)
11004                 ereport(NOTICE,
11005                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11006
11007         /*
11008          * We're done.  As a convenience, return the ending WAL location.
11009          */
11010         if (stoptli_p)
11011                 *stoptli_p = stoptli;
11012         return stoppoint;
11013 }
11014
11015
11016 /*
11017  * do_pg_abort_backup: abort a running backup
11018  *
11019  * This does just the most basic steps of do_pg_stop_backup(), by taking the
11020  * system out of backup mode, thus making it a lot more safe to call from
11021  * an error handler.
11022  *
11023  * NB: This is only for aborting a non-exclusive backup that doesn't write
11024  * backup_label. A backup started with pg_start_backup() needs to be finished
11025  * with pg_stop_backup().
11026  */
11027 void
11028 do_pg_abort_backup(void)
11029 {
11030         WALInsertLockAcquireExclusive();
11031         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11032         XLogCtl->Insert.nonExclusiveBackups--;
11033
11034         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11035                 XLogCtl->Insert.nonExclusiveBackups == 0)
11036         {
11037                 XLogCtl->Insert.forcePageWrites = false;
11038         }
11039         WALInsertLockRelease();
11040 }
11041
11042 /*
11043  * Get latest redo apply position.
11044  *
11045  * Exported to allow WALReceiver to read the pointer directly.
11046  */
11047 XLogRecPtr
11048 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11049 {
11050         XLogRecPtr      recptr;
11051         TimeLineID      tli;
11052
11053         SpinLockAcquire(&XLogCtl->info_lck);
11054         recptr = XLogCtl->lastReplayedEndRecPtr;
11055         tli = XLogCtl->lastReplayedTLI;
11056         SpinLockRelease(&XLogCtl->info_lck);
11057
11058         if (replayTLI)
11059                 *replayTLI = tli;
11060         return recptr;
11061 }
11062
11063 /*
11064  * Get latest WAL insert pointer
11065  */
11066 XLogRecPtr
11067 GetXLogInsertRecPtr(void)
11068 {
11069         XLogCtlInsert *Insert = &XLogCtl->Insert;
11070         uint64          current_bytepos;
11071
11072         SpinLockAcquire(&Insert->insertpos_lck);
11073         current_bytepos = Insert->CurrBytePos;
11074         SpinLockRelease(&Insert->insertpos_lck);
11075
11076         return XLogBytePosToRecPtr(current_bytepos);
11077 }
11078
11079 /*
11080  * Get latest WAL write pointer
11081  */
11082 XLogRecPtr
11083 GetXLogWriteRecPtr(void)
11084 {
11085         SpinLockAcquire(&XLogCtl->info_lck);
11086         LogwrtResult = XLogCtl->LogwrtResult;
11087         SpinLockRelease(&XLogCtl->info_lck);
11088
11089         return LogwrtResult.Write;
11090 }
11091
11092 /*
11093  * Returns the redo pointer of the last checkpoint or restartpoint. This is
11094  * the oldest point in WAL that we still need, if we have to restart recovery.
11095  */
11096 void
11097 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11098 {
11099         LWLockAcquire(ControlFileLock, LW_SHARED);
11100         *oldrecptr = ControlFile->checkPointCopy.redo;
11101         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11102         LWLockRelease(ControlFileLock);
11103 }
11104
11105 /*
11106  * read_backup_label: check to see if a backup_label file is present
11107  *
11108  * If we see a backup_label during recovery, we assume that we are recovering
11109  * from a backup dump file, and we therefore roll forward from the checkpoint
11110  * identified by the label file, NOT what pg_control says.  This avoids the
11111  * problem that pg_control might have been archived one or more checkpoints
11112  * later than the start of the dump, and so if we rely on it as the start
11113  * point, we will fail to restore a consistent database state.
11114  *
11115  * Returns TRUE if a backup_label was found (and fills the checkpoint
11116  * location and its REDO location into *checkPointLoc and RedoStartLSN,
11117  * respectively); returns FALSE if not. If this backup_label came from a
11118  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
11119  * was created during recovery, *backupFromStandby is set to TRUE.
11120  */
11121 static bool
11122 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
11123                                   bool *backupFromStandby)
11124 {
11125         char            startxlogfilename[MAXFNAMELEN];
11126         TimeLineID      tli;
11127         FILE       *lfp;
11128         char            ch;
11129         char            backuptype[20];
11130         char            backupfrom[20];
11131         uint32          hi,
11132                                 lo;
11133
11134         *backupEndRequired = false;
11135         *backupFromStandby = false;
11136
11137         /*
11138          * See if label file is present
11139          */
11140         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11141         if (!lfp)
11142         {
11143                 if (errno != ENOENT)
11144                         ereport(FATAL,
11145                                         (errcode_for_file_access(),
11146                                          errmsg("could not read file \"%s\": %m",
11147                                                         BACKUP_LABEL_FILE)));
11148                 return false;                   /* it's not there, all is fine */
11149         }
11150
11151         /*
11152          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11153          * is pretty crude, but we are not expecting any variability in the file
11154          * format).
11155          */
11156         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11157                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
11158                 ereport(FATAL,
11159                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11160                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11161         RedoStartLSN = ((uint64) hi) << 32 | lo;
11162         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11163                            &hi, &lo, &ch) != 3 || ch != '\n')
11164                 ereport(FATAL,
11165                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11166                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11167         *checkPointLoc = ((uint64) hi) << 32 | lo;
11168
11169         /*
11170          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11171          * from an older backup anyway, but since the information on it is not
11172          * strictly required, don't error out if it's missing for some reason.
11173          */
11174         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
11175         {
11176                 if (strcmp(backuptype, "streamed") == 0)
11177                         *backupEndRequired = true;
11178         }
11179
11180         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
11181         {
11182                 if (strcmp(backupfrom, "standby") == 0)
11183                         *backupFromStandby = true;
11184         }
11185
11186         if (ferror(lfp) || FreeFile(lfp))
11187                 ereport(FATAL,
11188                                 (errcode_for_file_access(),
11189                                  errmsg("could not read file \"%s\": %m",
11190                                                 BACKUP_LABEL_FILE)));
11191
11192         return true;
11193 }
11194
11195 /*
11196  * read_tablespace_map: check to see if a tablespace_map file is present
11197  *
11198  * If we see a tablespace_map file during recovery, we assume that we are
11199  * recovering from a backup dump file, and we therefore need to create symlinks
11200  * as per the information present in tablespace_map file.
11201  *
11202  * Returns TRUE if a tablespace_map file was found (and fills the link
11203  * information for all the tablespace links present in file); returns FALSE
11204  * if not.
11205  */
11206 static bool
11207 read_tablespace_map(List **tablespaces)
11208 {
11209         tablespaceinfo *ti;
11210         FILE       *lfp;
11211         char            tbsoid[MAXPGPATH];
11212         char       *tbslinkpath;
11213         char            str[MAXPGPATH];
11214         int                     ch,
11215                                 prev_ch = -1,
11216                                 i = 0,
11217                                 n;
11218
11219         /*
11220          * See if tablespace_map file is present
11221          */
11222         lfp = AllocateFile(TABLESPACE_MAP, "r");
11223         if (!lfp)
11224         {
11225                 if (errno != ENOENT)
11226                         ereport(FATAL,
11227                                         (errcode_for_file_access(),
11228                                          errmsg("could not read file \"%s\": %m",
11229                                                         TABLESPACE_MAP)));
11230                 return false;                   /* it's not there, all is fine */
11231         }
11232
11233         /*
11234          * Read and parse the link name and path lines from tablespace_map file
11235          * (this code is pretty crude, but we are not expecting any variability in
11236          * the file format).  While taking backup we embed escape character '\\'
11237          * before newline in tablespace path, so that during reading of
11238          * tablespace_map file, we could distinguish newline in tablespace path
11239          * and end of line.  Now while reading tablespace_map file, remove the
11240          * escape character that has been added in tablespace path during backup.
11241          */
11242         while ((ch = fgetc(lfp)) != EOF)
11243         {
11244                 if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11245                 {
11246                         str[i] = '\0';
11247                         if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11248                                 ereport(FATAL,
11249                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11250                                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11251                         tbslinkpath = str + n;
11252                         i = 0;
11253
11254                         ti = palloc(sizeof(tablespaceinfo));
11255                         ti->oid = pstrdup(tbsoid);
11256                         ti->path = pstrdup(tbslinkpath);
11257
11258                         *tablespaces = lappend(*tablespaces, ti);
11259                         continue;
11260                 }
11261                 else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11262                         str[i - 1] = ch;
11263                 else
11264                         str[i++] = ch;
11265                 prev_ch = ch;
11266         }
11267
11268         if (ferror(lfp) || FreeFile(lfp))
11269                 ereport(FATAL,
11270                                 (errcode_for_file_access(),
11271                                  errmsg("could not read file \"%s\": %m",
11272                                                 TABLESPACE_MAP)));
11273
11274         return true;
11275 }
11276
11277 /*
11278  * Error context callback for errors occurring during rm_redo().
11279  */
11280 static void
11281 rm_redo_error_callback(void *arg)
11282 {
11283         XLogReaderState *record = (XLogReaderState *) arg;
11284         StringInfoData buf;
11285
11286         initStringInfo(&buf);
11287         xlog_outdesc(&buf, record);
11288
11289         /* translator: %s is a WAL record description */
11290         errcontext("WAL redo at %X/%X for %s",
11291                            (uint32) (record->ReadRecPtr >> 32),
11292                            (uint32) record->ReadRecPtr,
11293                            buf.data);
11294
11295         pfree(buf.data);
11296 }
11297
11298 /*
11299  * BackupInProgress: check if online backup mode is active
11300  *
11301  * This is done by checking for existence of the "backup_label" file.
11302  */
11303 bool
11304 BackupInProgress(void)
11305 {
11306         struct stat stat_buf;
11307
11308         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11309 }
11310
11311 /*
11312  * CancelBackup: rename the "backup_label" and "tablespace_map"
11313  *                               files to cancel backup mode
11314  *
11315  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11316  * Similarly, if the "tablespace_map" file exists, it will be renamed to
11317  * "tablespace_map.old".
11318  *
11319  * Note that this will render an online backup in progress
11320  * useless. To correctly finish an online backup, pg_stop_backup must be
11321  * called.
11322  */
11323 void
11324 CancelBackup(void)
11325 {
11326         struct stat stat_buf;
11327
11328         /* if the backup_label file is not there, return */
11329         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11330                 return;
11331
11332         /* remove leftover file from previously canceled backup if it exists */
11333         unlink(BACKUP_LABEL_OLD);
11334
11335         if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11336         {
11337                 ereport(WARNING,
11338                                 (errcode_for_file_access(),
11339                                  errmsg("online backup mode was not canceled"),
11340                                  errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11341                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11342                 return;
11343         }
11344
11345         /* if the tablespace_map file is not there, return */
11346         if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11347         {
11348                 ereport(LOG,
11349                                 (errmsg("online backup mode canceled"),
11350                                  errdetail("File \"%s\" was renamed to \"%s\".",
11351                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11352                 return;
11353         }
11354
11355         /* remove leftover file from previously canceled backup if it exists */
11356         unlink(TABLESPACE_MAP_OLD);
11357
11358         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11359         {
11360                 ereport(LOG,
11361                                 (errmsg("online backup mode canceled"),
11362                                  errdetail("Files \"%s\" and \"%s\" were renamed to "
11363                                                    "\"%s\" and \"%s\", respectively.",
11364                                                    BACKUP_LABEL_FILE, TABLESPACE_MAP,
11365                                                    BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11366         }
11367         else
11368         {
11369                 ereport(WARNING,
11370                                 (errcode_for_file_access(),
11371                                  errmsg("online backup mode canceled"),
11372                                  errdetail("File \"%s\" was renamed to \"%s\", but "
11373                                                    "file \"%s\" could not be renamed to \"%s\": %m.",
11374                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11375                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11376         }
11377 }
11378
11379 /*
11380  * Read the XLOG page containing RecPtr into readBuf (if not read already).
11381  * Returns number of bytes read, if the page is read successfully, or -1
11382  * in case of errors.  When errors occur, they are ereport'ed, but only
11383  * if they have not been previously reported.
11384  *
11385  * This is responsible for restoring files from archive as needed, as well
11386  * as for waiting for the requested WAL record to arrive in standby mode.
11387  *
11388  * 'emode' specifies the log level used for reporting "file not found" or
11389  * "end of WAL" situations in archive recovery, or in standby mode when a
11390  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11391  * false in those situations, on higher log levels the ereport() won't
11392  * return.
11393  *
11394  * In standby mode, if after a successful return of XLogPageRead() the
11395  * caller finds the record it's interested in to be broken, it should
11396  * ereport the error with the level determined by
11397  * emode_for_corrupt_record(), and then set lastSourceFailed
11398  * and call XLogPageRead() again with the same arguments. This lets
11399  * XLogPageRead() to try fetching the record from another source, or to
11400  * sleep and retry.
11401  */
11402 static int
11403 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
11404                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
11405 {
11406         XLogPageReadPrivate *private =
11407         (XLogPageReadPrivate *) xlogreader->private_data;
11408         int                     emode = private->emode;
11409         uint32          targetPageOff;
11410         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
11411
11412         XLByteToSeg(targetPagePtr, targetSegNo);
11413         targetPageOff = targetPagePtr % XLogSegSize;
11414
11415         /*
11416          * See if we need to switch to a new segment because the requested record
11417          * is not in the currently open one.
11418          */
11419         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
11420         {
11421                 /*
11422                  * Request a restartpoint if we've replayed too much xlog since the
11423                  * last one.
11424                  */
11425                 if (bgwriterLaunched)
11426                 {
11427                         if (XLogCheckpointNeeded(readSegNo))
11428                         {
11429                                 (void) GetRedoRecPtr();
11430                                 if (XLogCheckpointNeeded(readSegNo))
11431                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
11432                         }
11433                 }
11434
11435                 close(readFile);
11436                 readFile = -1;
11437                 readSource = 0;
11438         }
11439
11440         XLByteToSeg(targetPagePtr, readSegNo);
11441
11442 retry:
11443         /* See if we need to retrieve more data */
11444         if (readFile < 0 ||
11445                 (readSource == XLOG_FROM_STREAM &&
11446                  receivedUpto < targetPagePtr + reqLen))
11447         {
11448                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
11449                                                                                  private->randAccess,
11450                                                                                  private->fetching_ckpt,
11451                                                                                  targetRecPtr))
11452                 {
11453                         if (readFile >= 0)
11454                                 close(readFile);
11455                         readFile = -1;
11456                         readLen = 0;
11457                         readSource = 0;
11458
11459                         return -1;
11460                 }
11461         }
11462
11463         /*
11464          * At this point, we have the right segment open and if we're streaming we
11465          * know the requested record is in it.
11466          */
11467         Assert(readFile != -1);
11468
11469         /*
11470          * If the current segment is being streamed from master, calculate how
11471          * much of the current page we have received already. We know the
11472          * requested record has been received, but this is for the benefit of
11473          * future calls, to allow quick exit at the top of this function.
11474          */
11475         if (readSource == XLOG_FROM_STREAM)
11476         {
11477                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
11478                         readLen = XLOG_BLCKSZ;
11479                 else
11480                         readLen = receivedUpto % XLogSegSize - targetPageOff;
11481         }
11482         else
11483                 readLen = XLOG_BLCKSZ;
11484
11485         /* Read the requested page */
11486         readOff = targetPageOff;
11487         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
11488         {
11489                 char            fname[MAXFNAMELEN];
11490
11491                 XLogFileName(fname, curFileTLI, readSegNo);
11492                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11493                                 (errcode_for_file_access(),
11494                                  errmsg("could not seek in log segment %s to offset %u: %m",
11495                                                 fname, readOff)));
11496                 goto next_record_is_invalid;
11497         }
11498
11499         pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
11500         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
11501         {
11502                 char            fname[MAXFNAMELEN];
11503
11504                 pgstat_report_wait_end();
11505                 XLogFileName(fname, curFileTLI, readSegNo);
11506                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11507                                 (errcode_for_file_access(),
11508                                  errmsg("could not read from log segment %s, offset %u: %m",
11509                                                 fname, readOff)));
11510                 goto next_record_is_invalid;
11511         }
11512         pgstat_report_wait_end();
11513
11514         Assert(targetSegNo == readSegNo);
11515         Assert(targetPageOff == readOff);
11516         Assert(reqLen <= readLen);
11517
11518         *readTLI = curFileTLI;
11519         return readLen;
11520
11521 next_record_is_invalid:
11522         lastSourceFailed = true;
11523
11524         if (readFile >= 0)
11525                 close(readFile);
11526         readFile = -1;
11527         readLen = 0;
11528         readSource = 0;
11529
11530         /* In standby-mode, keep trying */
11531         if (StandbyMode)
11532                 goto retry;
11533         else
11534                 return -1;
11535 }
11536
11537 /*
11538  * Open the WAL segment containing WAL position 'RecPtr'.
11539  *
11540  * The segment can be fetched via restore_command, or via walreceiver having
11541  * streamed the record, or it can already be present in pg_wal. Checking
11542  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
11543  * too, in case someone copies a new segment directly to pg_wal. That is not
11544  * documented or recommended, though.
11545  *
11546  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11547  * prepare to read WAL starting from RedoStartLSN after this.
11548  *
11549  * 'RecPtr' might not point to the beginning of the record we're interested
11550  * in, it might also point to the page or segment header. In that case,
11551  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11552  * used to decide which timeline to stream the requested WAL from.
11553  *
11554  * If the record is not immediately available, the function returns false
11555  * if we're not in standby mode. In standby mode, waits for it to become
11556  * available.
11557  *
11558  * When the requested record becomes available, the function opens the file
11559  * containing it (if not open already), and returns true. When end of standby
11560  * mode is triggered by the user, and there is no more WAL available, returns
11561  * false.
11562  */
11563 static bool
11564 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11565                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
11566 {
11567         static TimestampTz last_fail_time = 0;
11568         TimestampTz now;
11569         bool            streaming_reply_sent = false;
11570
11571         /*-------
11572          * Standby mode is implemented by a state machine:
11573          *
11574          * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
11575          *        pg_wal (XLOG_FROM_PG_WAL)
11576          * 2. Check trigger file
11577          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11578          * 4. Rescan timelines
11579          * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
11580          *
11581          * Failure to read from the current source advances the state machine to
11582          * the next state.
11583          *
11584          * 'currentSource' indicates the current state. There are no currentSource
11585          * values for "check trigger", "rescan timelines", and "sleep" states,
11586          * those actions are taken when reading from the previous source fails, as
11587          * part of advancing to the next state.
11588          *-------
11589          */
11590         if (!InArchiveRecovery)
11591                 currentSource = XLOG_FROM_PG_WAL;
11592         else if (currentSource == 0)
11593                 currentSource = XLOG_FROM_ARCHIVE;
11594
11595         for (;;)
11596         {
11597                 int                     oldSource = currentSource;
11598
11599                 /*
11600                  * First check if we failed to read from the current source, and
11601                  * advance the state machine if so. The failure to read might've
11602                  * happened outside this function, e.g when a CRC check fails on a
11603                  * record, or within this loop.
11604                  */
11605                 if (lastSourceFailed)
11606                 {
11607                         switch (currentSource)
11608                         {
11609                                 case XLOG_FROM_ARCHIVE:
11610                                 case XLOG_FROM_PG_WAL:
11611
11612                                         /*
11613                                          * Check to see if the trigger file exists. Note that we
11614                                          * do this only after failure, so when you create the
11615                                          * trigger file, we still finish replaying as much as we
11616                                          * can from archive and pg_wal before failover.
11617                                          */
11618                                         if (StandbyMode && CheckForStandbyTrigger())
11619                                         {
11620                                                 ShutdownWalRcv();
11621                                                 return false;
11622                                         }
11623
11624                                         /*
11625                                          * Not in standby mode, and we've now tried the archive
11626                                          * and pg_wal.
11627                                          */
11628                                         if (!StandbyMode)
11629                                                 return false;
11630
11631                                         /*
11632                                          * If primary_conninfo is set, launch walreceiver to try
11633                                          * to stream the missing WAL.
11634                                          *
11635                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
11636                                          * checkpoint location. In that case, we use RedoStartLSN
11637                                          * as the streaming start position instead of RecPtr, so
11638                                          * that when we later jump backwards to start redo at
11639                                          * RedoStartLSN, we will have the logs streamed already.
11640                                          */
11641                                         if (PrimaryConnInfo)
11642                                         {
11643                                                 XLogRecPtr      ptr;
11644                                                 TimeLineID      tli;
11645
11646                                                 if (fetching_ckpt)
11647                                                 {
11648                                                         ptr = RedoStartLSN;
11649                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
11650                                                 }
11651                                                 else
11652                                                 {
11653                                                         ptr = tliRecPtr;
11654                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
11655
11656                                                         if (curFileTLI > 0 && tli < curFileTLI)
11657                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11658                                                                          (uint32) (ptr >> 32), (uint32) ptr,
11659                                                                          tli, curFileTLI);
11660                                                 }
11661                                                 curFileTLI = tli;
11662                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
11663                                                                                          PrimarySlotName);
11664                                                 receivedUpto = 0;
11665                                         }
11666
11667                                         /*
11668                                          * Move to XLOG_FROM_STREAM state in either case. We'll
11669                                          * get immediate failure if we didn't launch walreceiver,
11670                                          * and move on to the next state.
11671                                          */
11672                                         currentSource = XLOG_FROM_STREAM;
11673                                         break;
11674
11675                                 case XLOG_FROM_STREAM:
11676
11677                                         /*
11678                                          * Failure while streaming. Most likely, we got here
11679                                          * because streaming replication was terminated, or
11680                                          * promotion was triggered. But we also get here if we
11681                                          * find an invalid record in the WAL streamed from master,
11682                                          * in which case something is seriously wrong. There's
11683                                          * little chance that the problem will just go away, but
11684                                          * PANIC is not good for availability either, especially
11685                                          * in hot standby mode. So, we treat that the same as
11686                                          * disconnection, and retry from archive/pg_wal again.
11687                                          * The WAL in the archive should be identical to what was
11688                                          * streamed, so it's unlikely that it helps, but one can
11689                                          * hope...
11690                                          */
11691
11692                                         /*
11693                                          * Before we leave XLOG_FROM_STREAM state, make sure that
11694                                          * walreceiver is not active, so that it won't overwrite
11695                                          * WAL that we restore from archive.
11696                                          */
11697                                         if (WalRcvStreaming())
11698                                                 ShutdownWalRcv();
11699
11700                                         /*
11701                                          * Before we sleep, re-scan for possible new timelines if
11702                                          * we were requested to recover to the latest timeline.
11703                                          */
11704                                         if (recoveryTargetIsLatest)
11705                                         {
11706                                                 if (rescanLatestTimeLine())
11707                                                 {
11708                                                         currentSource = XLOG_FROM_ARCHIVE;
11709                                                         break;
11710                                                 }
11711                                         }
11712
11713                                         /*
11714                                          * XLOG_FROM_STREAM is the last state in our state
11715                                          * machine, so we've exhausted all the options for
11716                                          * obtaining the requested WAL. We're going to loop back
11717                                          * and retry from the archive, but if it hasn't been long
11718                                          * since last attempt, sleep wal_retrieve_retry_interval
11719                                          * milliseconds to avoid busy-waiting.
11720                                          */
11721                                         now = GetCurrentTimestamp();
11722                                         if (!TimestampDifferenceExceeds(last_fail_time, now,
11723                                                                                                 wal_retrieve_retry_interval))
11724                                         {
11725                                                 long            secs,
11726                                                                         wait_time;
11727                                                 int                     usecs;
11728
11729                                                 TimestampDifference(last_fail_time, now, &secs, &usecs);
11730                                                 wait_time = wal_retrieve_retry_interval -
11731                                                         (secs * 1000 + usecs / 1000);
11732
11733                                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
11734                                                          WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
11735                                                                   wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
11736                                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
11737                                                 now = GetCurrentTimestamp();
11738                                         }
11739                                         last_fail_time = now;
11740                                         currentSource = XLOG_FROM_ARCHIVE;
11741                                         break;
11742
11743                                 default:
11744                                         elog(ERROR, "unexpected WAL source %d", currentSource);
11745                         }
11746                 }
11747                 else if (currentSource == XLOG_FROM_PG_WAL)
11748                 {
11749                         /*
11750                          * We just successfully read a file in pg_wal. We prefer files in
11751                          * the archive over ones in pg_wal, so try the next file again
11752                          * from the archive first.
11753                          */
11754                         if (InArchiveRecovery)
11755                                 currentSource = XLOG_FROM_ARCHIVE;
11756                 }
11757
11758                 if (currentSource != oldSource)
11759                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
11760                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11761                                  lastSourceFailed ? "failure" : "success");
11762
11763                 /*
11764                  * We've now handled possible failure. Try to read from the chosen
11765                  * source.
11766                  */
11767                 lastSourceFailed = false;
11768
11769                 switch (currentSource)
11770                 {
11771                         case XLOG_FROM_ARCHIVE:
11772                         case XLOG_FROM_PG_WAL:
11773                                 /* Close any old file we might have open. */
11774                                 if (readFile >= 0)
11775                                 {
11776                                         close(readFile);
11777                                         readFile = -1;
11778                                 }
11779                                 /* Reset curFileTLI if random fetch. */
11780                                 if (randAccess)
11781                                         curFileTLI = 0;
11782
11783                                 /*
11784                                  * Try to restore the file from archive, or read an existing
11785                                  * file from pg_wal.
11786                                  */
11787                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
11788                                                  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
11789                                                                                           currentSource);
11790                                 if (readFile >= 0)
11791                                         return true;    /* success! */
11792
11793                                 /*
11794                                  * Nope, not found in archive or pg_wal.
11795                                  */
11796                                 lastSourceFailed = true;
11797                                 break;
11798
11799                         case XLOG_FROM_STREAM:
11800                                 {
11801                                         bool            havedata;
11802
11803                                         /*
11804                                          * Check if WAL receiver is still active.
11805                                          */
11806                                         if (!WalRcvStreaming())
11807                                         {
11808                                                 lastSourceFailed = true;
11809                                                 break;
11810                                         }
11811
11812                                         /*
11813                                          * Walreceiver is active, so see if new data has arrived.
11814                                          *
11815                                          * We only advance XLogReceiptTime when we obtain fresh
11816                                          * WAL from walreceiver and observe that we had already
11817                                          * processed everything before the most recent "chunk"
11818                                          * that it flushed to disk.  In steady state where we are
11819                                          * keeping up with the incoming data, XLogReceiptTime will
11820                                          * be updated on each cycle. When we are behind,
11821                                          * XLogReceiptTime will not advance, so the grace time
11822                                          * allotted to conflicting queries will decrease.
11823                                          */
11824                                         if (RecPtr < receivedUpto)
11825                                                 havedata = true;
11826                                         else
11827                                         {
11828                                                 XLogRecPtr      latestChunkStart;
11829
11830                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
11831                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
11832                                                 {
11833                                                         havedata = true;
11834                                                         if (latestChunkStart <= RecPtr)
11835                                                         {
11836                                                                 XLogReceiptTime = GetCurrentTimestamp();
11837                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
11838                                                         }
11839                                                 }
11840                                                 else
11841                                                         havedata = false;
11842                                         }
11843                                         if (havedata)
11844                                         {
11845                                                 /*
11846                                                  * Great, streamed far enough.  Open the file if it's
11847                                                  * not open already.  Also read the timeline history
11848                                                  * file if we haven't initialized timeline history
11849                                                  * yet; it should be streamed over and present in
11850                                                  * pg_wal by now.  Use XLOG_FROM_STREAM so that
11851                                                  * source info is set correctly and XLogReceiptTime
11852                                                  * isn't changed.
11853                                                  */
11854                                                 if (readFile < 0)
11855                                                 {
11856                                                         if (!expectedTLEs)
11857                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
11858                                                         readFile = XLogFileRead(readSegNo, PANIC,
11859                                                                                                         receiveTLI,
11860                                                                                                         XLOG_FROM_STREAM, false);
11861                                                         Assert(readFile >= 0);
11862                                                 }
11863                                                 else
11864                                                 {
11865                                                         /* just make sure source info is correct... */
11866                                                         readSource = XLOG_FROM_STREAM;
11867                                                         XLogReceiptSource = XLOG_FROM_STREAM;
11868                                                         return true;
11869                                                 }
11870                                                 break;
11871                                         }
11872
11873                                         /*
11874                                          * Data not here yet. Check for trigger, then wait for
11875                                          * walreceiver to wake us up when new WAL arrives.
11876                                          */
11877                                         if (CheckForStandbyTrigger())
11878                                         {
11879                                                 /*
11880                                                  * Note that we don't "return false" immediately here.
11881                                                  * After being triggered, we still want to replay all
11882                                                  * the WAL that was already streamed. It's in pg_wal
11883                                                  * now, so we just treat this as a failure, and the
11884                                                  * state machine will move on to replay the streamed
11885                                                  * WAL from pg_wal, and then recheck the trigger and
11886                                                  * exit replay.
11887                                                  */
11888                                                 lastSourceFailed = true;
11889                                                 break;
11890                                         }
11891
11892                                         /*
11893                                          * Since we have replayed everything we have received so
11894                                          * far and are about to start waiting for more WAL, let's
11895                                          * tell the upstream server our replay location now so
11896                                          * that pg_stat_replication doesn't show stale
11897                                          * information.
11898                                          */
11899                                         if (!streaming_reply_sent)
11900                                         {
11901                                                 WalRcvForceReply();
11902                                                 streaming_reply_sent = true;
11903                                         }
11904
11905                                         /*
11906                                          * Wait for more WAL to arrive. Time out after 5 seconds
11907                                          * to react to a trigger file promptly.
11908                                          */
11909                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11910                                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
11911                                                           5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
11912                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11913                                         break;
11914                                 }
11915
11916                         default:
11917                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11918                 }
11919
11920                 /*
11921                  * This possibly-long loop needs to handle interrupts of startup
11922                  * process.
11923                  */
11924                 HandleStartupProcInterrupts();
11925         }
11926
11927         return false;                           /* not reached */
11928 }
11929
11930 /*
11931  * Determine what log level should be used to report a corrupt WAL record
11932  * in the current WAL page, previously read by XLogPageRead().
11933  *
11934  * 'emode' is the error mode that would be used to report a file-not-found
11935  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11936  * we're retrying the exact same record that we've tried previously, only
11937  * complain the first time to keep the noise down.  However, we only do when
11938  * reading from pg_wal, because we don't expect any invalid records in archive
11939  * or in records streamed from master. Files in the archive should be complete,
11940  * and we should never hit the end of WAL because we stop and wait for more WAL
11941  * to arrive before replaying it.
11942  *
11943  * NOTE: This function remembers the RecPtr value it was last called with,
11944  * to suppress repeated messages about the same record. Only call this when
11945  * you are about to ereport(), or you might cause a later message to be
11946  * erroneously suppressed.
11947  */
11948 static int
11949 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11950 {
11951         static XLogRecPtr lastComplaint = 0;
11952
11953         if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
11954         {
11955                 if (RecPtr == lastComplaint)
11956                         emode = DEBUG1;
11957                 else
11958                         lastComplaint = RecPtr;
11959         }
11960         return emode;
11961 }
11962
11963 /*
11964  * Check to see whether the user-specified trigger file exists and whether a
11965  * promote request has arrived.  If either condition holds, return true.
11966  */
11967 static bool
11968 CheckForStandbyTrigger(void)
11969 {
11970         struct stat stat_buf;
11971         static bool triggered = false;
11972
11973         if (triggered)
11974                 return true;
11975
11976         if (IsPromoteTriggered())
11977         {
11978                 /*
11979                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11980                  * signal handler. It now leaves the file in place and lets the
11981                  * Startup process do the unlink. This allows Startup to know whether
11982                  * it should create a full checkpoint before starting up (fallback
11983                  * mode). Fast promotion takes precedence.
11984                  */
11985                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11986                 {
11987                         unlink(PROMOTE_SIGNAL_FILE);
11988                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11989                         fast_promote = true;
11990                 }
11991                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11992                 {
11993                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11994                         fast_promote = false;
11995                 }
11996
11997                 ereport(LOG, (errmsg("received promote request")));
11998
11999                 ResetPromoteTriggered();
12000                 triggered = true;
12001                 return true;
12002         }
12003
12004         if (TriggerFile == NULL)
12005                 return false;
12006
12007         if (stat(TriggerFile, &stat_buf) == 0)
12008         {
12009                 ereport(LOG,
12010                                 (errmsg("trigger file found: %s", TriggerFile)));
12011                 unlink(TriggerFile);
12012                 triggered = true;
12013                 fast_promote = true;
12014                 return true;
12015         }
12016         else if (errno != ENOENT)
12017                 ereport(ERROR,
12018                                 (errcode_for_file_access(),
12019                                  errmsg("could not stat trigger file \"%s\": %m",
12020                                                 TriggerFile)));
12021
12022         return false;
12023 }
12024
12025 /*
12026  * Remove the files signaling a standby promotion request.
12027  */
12028 void
12029 RemovePromoteSignalFiles(void)
12030 {
12031         unlink(PROMOTE_SIGNAL_FILE);
12032         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12033 }
12034
12035 /*
12036  * Check to see if a promote request has arrived. Should be
12037  * called by postmaster after receiving SIGUSR1.
12038  */
12039 bool
12040 CheckPromoteSignal(void)
12041 {
12042         struct stat stat_buf;
12043
12044         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12045                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12046                 return true;
12047
12048         return false;
12049 }
12050
12051 /*
12052  * Wake up startup process to replay newly arrived WAL, or to notice that
12053  * failover has been requested.
12054  */
12055 void
12056 WakeupRecovery(void)
12057 {
12058         SetLatch(&XLogCtl->recoveryWakeupLatch);
12059 }
12060
12061 /*
12062  * Update the WalWriterSleeping flag.
12063  */
12064 void
12065 SetWalWriterSleeping(bool sleeping)
12066 {
12067         SpinLockAcquire(&XLogCtl->info_lck);
12068         XLogCtl->WalWriterSleeping = sleeping;
12069         SpinLockRelease(&XLogCtl->info_lck);
12070 }
12071
12072 /*
12073  * Schedule a walreceiver wakeup in the main recovery loop.
12074  */
12075 void
12076 XLogRequestWalReceiverReply(void)
12077 {
12078         doRequestWalReceiverReply = true;
12079 }