granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL write-ahead log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <math.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <unistd.h>
  24
  25 #include "access/clog.h"
  26 #include "access/commit_ts.h"
  27 #include "access/multixact.h"
  28 #include "access/rewriteheap.h"
  29 #include "access/subtrans.h"
  30 #include "access/timeline.h"
  31 #include "access/transam.h"
  32 #include "access/tuptoaster.h"
  33 #include "access/twophase.h"
  34 #include "access/xact.h"
  35 #include "access/xlog_internal.h"
  36 #include "access/xloginsert.h"
  37 #include "access/xlogreader.h"
  38 #include "access/xlogutils.h"
  39 #include "catalog/catversion.h"
  40 #include "catalog/pg_control.h"
  41 #include "catalog/pg_database.h"
  42 #include "commands/tablespace.h"
  43 #include "miscadmin.h"
  44 #include "pgstat.h"
  45 #include "port/atomics.h"
  46 #include "postmaster/bgwriter.h"
  47 #include "postmaster/walwriter.h"
  48 #include "postmaster/startup.h"
  49 #include "replication/basebackup.h"
  50 #include "replication/logical.h"
  51 #include "replication/slot.h"
  52 #include "replication/origin.h"
  53 #include "replication/snapbuild.h"
  54 #include "replication/walreceiver.h"
  55 #include "replication/walsender.h"
  56 #include "storage/bufmgr.h"
  57 #include "storage/fd.h"
  58 #include "storage/ipc.h"
  59 #include "storage/large_object.h"
  60 #include "storage/latch.h"
  61 #include "storage/pmsignal.h"
  62 #include "storage/predicate.h"
  63 #include "storage/proc.h"
  64 #include "storage/procarray.h"
  65 #include "storage/reinit.h"
  66 #include "storage/smgr.h"
  67 #include "storage/spin.h"
  68 #include "utils/backend_random.h"
  69 #include "utils/builtins.h"
  70 #include "utils/guc.h"
  71 #include "utils/memutils.h"
  72 #include "utils/pg_lsn.h"
  73 #include "utils/ps_status.h"
  74 #include "utils/relmapper.h"
  75 #include "utils/snapmgr.h"
  76 #include "utils/timestamp.h"
  77 #include "pg_trace.h"
  78
  79 extern uint32 bootstrap_data_checksum_version;
  80
  81 /* File path names (all relative to $PGDATA) */
  82 #define RECOVERY_COMMAND_FILE   "recovery.conf"	/* its presence requests archive recovery; see ArchiveRecoveryRequested */
  83 #define RECOVERY_COMMAND_DONE   "recovery.done"	/* NOTE(review): presumably the name recovery.conf gets once recovery ends - confirm */
  84 #define PROMOTE_SIGNAL_FILE             "promote"	/* NOTE(review): presumably a promotion request marker - confirm against its users */
  85 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  86
  87
  88 /* User-settable parameters */
  89 int                     max_wal_size_mb = 1024; /* 1 GB */
  90 int                     min_wal_size_mb = 80;   /* 80 MB */
  91 int                     wal_keep_segments = 0;
  92 int                     XLOGbuffers = -1;	/* NOTE(review): -1 presumably means "pick a size automatically" - confirm */
  93 int                     XLogArchiveTimeout = 0;
  94 int                     XLogArchiveMode = ARCHIVE_MODE_OFF;	/* an ARCHIVE_MODE_* value; names are listed in archive_mode_options */
  95 char       *XLogArchiveCommand = NULL;
  96 bool            EnableHotStandby = false;
  97 bool            fullPageWrites = true;	/* process-local value; the master copy is XLogCtlInsert.fullPageWrites */
  98 bool            wal_log_hints = false;
  99 bool            wal_compression = false;
 100 char       *wal_consistency_checking_string = NULL;
 101 bool       *wal_consistency_checking = NULL;
 102 bool            log_checkpoints = false;
 103 int                     sync_method = DEFAULT_SYNC_METHOD;	/* a SYNC_METHOD_* value; names are listed in sync_method_options */
 104 int                     wal_level = WAL_LEVEL_MINIMAL;
 105 int                     CommitDelay = 0;        /* precommit delay in microseconds */
 106 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 107 int                     wal_retrieve_retry_interval = 5000;	/* NOTE(review): units not stated here - presumably milliseconds; confirm */
 108
 109 #ifdef WAL_DEBUG
 110 bool            XLOG_DEBUG = false;
 111 #endif
 112
 113 int                     wal_segment_size = DEFAULT_XLOG_SEG_SIZE;	/* starts out at the build-time default segment size */
 114
 115 /*
 116  * Number of WAL insertion locks to use. A higher value allows more insertions
 117  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 118  * which needs to iterate all the locks.
 119  */
 120 #define NUM_XLOGINSERT_LOCKS  8
 121
 122 /*
 123  * Max distance from last checkpoint, before triggering a new xlog-based
 124  * checkpoint.
 125  */
 126 int                     CheckPointSegments;
 127
 128 /* Estimated distance between checkpoints, in bytes */
 129 static double CheckPointDistanceEstimate = 0;
 130 static double PrevCheckPointDistance = 0;
 131
 132 /*
 133  * GUC support
      *
      * sync_method_options maps each accepted setting name to its SYNC_METHOD_*
      * value.  Only "fsync" is listed unconditionally; every other method is
      * compiled in only when its guarding preprocessor symbol is defined.  The
      * array ends with an entry whose name is NULL.
 134  */
 135 const struct config_enum_entry sync_method_options[] = {
 136         {"fsync", SYNC_METHOD_FSYNC, false},
 137 #ifdef HAVE_FSYNC_WRITETHROUGH
 138         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 139 #endif
 140 #ifdef HAVE_FDATASYNC
 141         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 142 #endif
 143 #ifdef OPEN_SYNC_FLAG
 144         {"open_sync", SYNC_METHOD_OPEN, false},
 145 #endif
 146 #ifdef OPEN_DATASYNC_FLAG
 147         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 148 #endif
 149         {NULL, 0, false}	/* NULL name closes the list */
 150 };
 151
 152
 153 /*
 154  * Although only "on", "off", and "always" are documented,
 155  * we accept all the likely variants of "on" and "off".
      *
      * Each entry maps a setting name to an ARCHIVE_MODE_* value.  The extra
      * spellings carry 'true' in the third field where the documented ones carry
      * 'false' (NOTE(review): presumably the "hidden" flag of config_enum_entry -
      * confirm in the GUC headers).  The array ends with a NULL-name entry.
 156  */
 157 const struct config_enum_entry archive_mode_options[] = {
 158         {"always", ARCHIVE_MODE_ALWAYS, false},
 159         {"on", ARCHIVE_MODE_ON, false},
 160         {"off", ARCHIVE_MODE_OFF, false},
 161         {"true", ARCHIVE_MODE_ON, true},	/* undocumented variants start here */
 162         {"false", ARCHIVE_MODE_OFF, true},
 163         {"yes", ARCHIVE_MODE_ON, true},
 164         {"no", ARCHIVE_MODE_OFF, true},
 165         {"1", ARCHIVE_MODE_ON, true},
 166         {"0", ARCHIVE_MODE_OFF, true},
 167         {NULL, 0, false}	/* NULL name closes the list */
 168 };
 169
 170 /*
 171  * Statistics for current checkpoint are collected in this global struct.
 172  * Because only the checkpointer or a stand-alone backend can perform
 173  * checkpoints, this will be unused in normal backends.
 174  */
 175 CheckpointStatsData CheckpointStats;
 176
 177 /*
 178  * ThisTimeLineID will be same in all backends --- it identifies current
 179  * WAL timeline for the database system.
 180  */
 181 TimeLineID      ThisTimeLineID = 0;
 182
 183 /*
 184  * Are we doing recovery from XLOG?
 185  *
 186  * This is only ever true in the startup process; it should be read as meaning
 187  * "this process is replaying WAL records", rather than "the system is in
 188  * recovery mode".  It should be examined primarily by functions that need
 189  * to act differently when called from a WAL redo function (e.g., to skip WAL
 190  * logging).  To check whether the system is in recovery regardless of which
 191  * process you're running in, use RecoveryInProgress() but only after shared
 192  * memory startup and lock initialization.
 193  */
 194 bool            InRecovery = false;
 195
 196 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 197 HotStandbyState standbyState = STANDBY_DISABLED;
 198
 199 static XLogRecPtr LastRec;
 200
 201 /* Local copy of WalRcv->receivedUpto */
 202 static XLogRecPtr receivedUpto = 0;
 203 static TimeLineID receiveTLI = 0;
 204
 205 /*
 206  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 207  * the replayed WAL records indicate. It's initialized with full_page_writes
 208  * that the recovery starting checkpoint record indicates, and then updated
 209  * each time XLOG_FPW_CHANGE record is replayed.
 210  */
 211 static bool lastFullPageWrites;
 212
 213 /*
 214  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 215  * known, need to check the shared state".
 216  */
 217 static bool LocalRecoveryInProgress = true;
 218
 219 /*
 220  * Local copy of SharedHotStandbyActive variable. False actually means "not
 221  * known, need to check the shared state".
 222  */
 223 static bool LocalHotStandbyActive = false;
 224
 225 /*
 226  * Local state for XLogInsertAllowed():
 227  *              1: unconditionally allowed to insert XLOG
 228  *              0: unconditionally not allowed to insert XLOG
 229  *              -1: must check RecoveryInProgress(); disallow until it is false
 230  * Most processes start with -1 and transition to 1 after seeing that recovery
 231  * is not in progress.  But we can also force the value for special cases.
 232  * The coding in XLogInsertAllowed() depends on the first two of these states
 233  * being numerically the same as bool true and false.
 234  */
 235 static int      LocalXLogInsertAllowed = -1;
 236
 237 /*
 238  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 239  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 240  * currently recovering using offline XLOG archives. These variables are only
 241  * valid in the startup process.
 242  *
 243  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 244  * currently performing crash recovery using only XLOG files in pg_wal, but
 245  * will switch to using offline XLOG archives as soon as we reach the end of
 246  * WAL in pg_wal.
 247 */
 248 bool            ArchiveRecoveryRequested = false;
 249 bool            InArchiveRecovery = false;
 250
 251 /* Was the last xlog file restored from archive, or local? */
 252 static bool restoredFromArchive = false;
 253
 254 /* Buffers dedicated to consistency checks of size BLCKSZ */
 255 static char *replay_image_masked = NULL;
 256 static char *master_image_masked = NULL;
 257
 258 /* options taken from recovery.conf for archive recovery */
 259 char       *recoveryRestoreCommand = NULL;
 260 static char *recoveryEndCommand = NULL;
 261 static char *archiveCleanupCommand = NULL;
 262 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 263 static bool recoveryTargetInclusive = true;
 264 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
 265 static TransactionId recoveryTargetXid;
 266 static TimestampTz recoveryTargetTime;
 267 static char *recoveryTargetName;
 268 static XLogRecPtr recoveryTargetLSN;
 269 static int      recovery_min_apply_delay = 0;
 270 static TimestampTz recoveryDelayUntilTime;
 271
 272 /* options taken from recovery.conf for XLOG streaming */
 273 static bool StandbyModeRequested = false;
 274 static char *PrimaryConnInfo = NULL;
 275 static char *PrimarySlotName = NULL;
 276 static char *TriggerFile = NULL;
 277
 278 /* are we currently in standby mode? */
 279 bool            StandbyMode = false;
 280
 281 /* whether request for fast promotion has been made yet */
 282 static bool fast_promote = false;
 283
 284 /*
 285  * if recoveryStopsBefore/After returns true, it saves information of the stop
 286  * point here
 287  */
 288 static TransactionId recoveryStopXid;
 289 static TimestampTz recoveryStopTime;
 290 static XLogRecPtr recoveryStopLSN;
 291 static char recoveryStopName[MAXFNAMELEN];
 292 static bool recoveryStopAfter;
 293
 294 /*
 295  * During normal operation, the only timeline we care about is ThisTimeLineID.
 296  * During recovery, however, things are more complicated.  To simplify life
 297  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 298  * scan through the WAL history (that is, it is the line that was active when
 299  * the currently-scanned WAL record was generated).  We also need these
 300  * timeline values:
 301  *
 302  * recoveryTargetTLI: the desired timeline that we want to end in.
 303  *
 304  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 305  *
 306  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 307  * its known parents, newest first (so recoveryTargetTLI is always the
 308  * first list member).  Only these TLIs are expected to be seen in the WAL
 309  * segments we read, and indeed only these TLIs will be considered as
 310  * candidate WAL files to open at all.
 311  *
 312  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 313  * (This is not necessarily the same as ThisTimeLineID, because we could
 314  * be scanning data that was copied from an ancestor timeline when the current
 315  * file was created.)  During a sequential scan we do not allow this value
 316  * to decrease.
 317  */
 318 static TimeLineID recoveryTargetTLI;
 319 static bool recoveryTargetIsLatest = false;
 320 static List *expectedTLEs;
 321 static TimeLineID curFileTLI;
 322
 323 /*
 324  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 325  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 326  * end+1 of the last record, and is reset when we end a top-level transaction,
 327  * or start a new one; so it can be used to tell if the current transaction has
 328  * created any XLOG records.
 329  *
 330  * While in parallel mode, this may not be fully up to date.  When committing,
 331  * a transaction can assume this covers all xlog records written either by the
 332  * user backend or by any parallel worker which was present at any point during
 333  * the transaction.  But when aborting, or when still in parallel mode, other
 334  * parallel backends may have written WAL records at later LSNs than the value
 335  * stored here.  The parallel leader advances its own copy, when necessary,
 336  * in WaitForParallelWorkersToFinish.
      *
      * All three variables start out as InvalidXLogRecPtr.
 337  */
 338 XLogRecPtr      ProcLastRecPtr = InvalidXLogRecPtr;
 339 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 340 XLogRecPtr      XactLastCommitEnd = InvalidXLogRecPtr;	/* NOTE(review): not described above; presumably end+1 of the last commit record - confirm */
 341
 342 /*
 343  * RedoRecPtr is this backend's local copy of the REDO record pointer
 344  * (which is almost but not quite the same as a pointer to the most recent
 345  * CHECKPOINT record).  We update this from the shared-memory copy,
 346  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 347  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 348  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 349  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 350  * InitXLOGAccess.
 351  */
 352 static XLogRecPtr RedoRecPtr;
 353
 354 /*
 355  * doPageWrites is this backend's local copy of (forcePageWrites ||
 356  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
 357  * a full-page image of a page needs to be taken.
 358  */
 359 static bool doPageWrites;
 360
 361 /* Has the recovery code requested a walreceiver wakeup? */
 362 static bool doRequestWalReceiverReply;
 363
 364 /*
 365  * RedoStartLSN points to the checkpoint's REDO location which is specified
 366  * in a backup label file, backup history file or control file. In standby
 367  * mode, XLOG streaming usually starts from the position where an invalid
 368  * record was found. But if we fail to read even the initial checkpoint
 369  * record, we use the REDO location instead of the checkpoint location as
 370  * the start position of XLOG streaming. Otherwise we would have to jump
 371  * backwards to the REDO location after reading the checkpoint record,
 372  * because the REDO record can precede the checkpoint record.
 373  */
 374 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 375
 376 /*----------
 377  * Shared-memory data structures for XLOG control
 378  *
 379  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 380  * the log up to (all records before that point must be written or fsynced).
 381  * LogwrtResult indicates the byte positions we have already written/fsynced.
 382  * These structs are identical but are declared separately to indicate their
 383  * slightly different functions.
 384  *
 385  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 386  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 387  * this arrangement is that the value can be examined by code that already
 388  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 389  * to the shared variable, each backend has a private copy of LogwrtResult,
 390  * which is updated when convenient.
 391  *
 392  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 393  * (protected by info_lck), but we don't need to cache any copies of it.
 394  *
 395  * info_lck is only held long enough to read/update the protected variables,
 396  * so it's a plain spinlock.  The other locks are held longer (potentially
 397  * over I/O operations), so we use LWLocks for them.  These locks are:
 398  *
 399  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 400  * It is only held while initializing and changing the mapping.  If the
 401  * contents of the buffer being replaced haven't been written yet, the mapping
 402  * lock is released while the write is done, and reacquired afterwards.
 403  *
 404  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 405  * XLogFlush).
 406  *
 407  * ControlFileLock: must be held to read/update control file or create
 408  * new log file.
 409  *
 410  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 411  * only one checkpointer at a time; currently, with all checkpoints done by
 412  * the checkpointer, this is just pro forma).
 413  *
 414  *----------
 415  */
 416
 417 typedef struct XLogwrtRqst	/* positions the log is requested to be written/flushed up to */
 418 {
 419         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 420         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 421 } XLogwrtRqst;
 422
 423 typedef struct XLogwrtResult	/* positions already written/flushed; same layout as XLogwrtRqst */
 424 {
 425         XLogRecPtr      Write;                  /* last byte + 1 written out */
 426         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 427 } XLogwrtResult;
 428
 429 /*
 430  * Inserting to WAL is protected by a small fixed number of WAL insertion
 431  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 432  * matter which one. To lock out other concurrent insertions, you must hold
 433  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
 434  * indicator of how far the insertion has progressed (insertingAt).
 435  *
 436  * The insertingAt values are read when a process wants to flush WAL from
 437  * the in-memory buffers to disk, to check that all the insertions to the
 438  * region the process is about to write out have finished. You could simply
 439  * wait for all currently in-progress insertions to finish, but the
 440  * insertingAt indicator allows you to ignore insertions to later in the WAL,
 441  * so that you only wait for the insertions that are modifying the buffers
 442  * you're about to write out.
 443  *
 444  * This isn't just an optimization. If all the WAL buffers are dirty, an
 445  * inserter that's holding a WAL insert lock might need to evict an old WAL
 446  * buffer, which requires flushing the WAL. If it's possible for an inserter
 447  * to block on another inserter unnecessarily, deadlock can arise when two
 448  * inserters holding a WAL insert lock wait for each other to finish their
 449  * insertion.
 450  *
 451  * Small WAL records that don't cross a page boundary never update the value,
 452  * the WAL record is just copied to the page and the lock is released. But
 453  * to avoid the deadlock-scenario explained above, the indicator is always
 454  * updated before sleeping while holding an insertion lock.
 455  *
 456  * lastImportantAt contains the LSN of the last important WAL record inserted
 457  * using a given lock. This value is used to detect if there has been
 458  * important WAL activity since the last time some action, like a checkpoint,
 459  * was performed - so the action can be skipped if there was none. The LSN is
 460  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
 461  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
 462  * records.  Tracking the WAL activity directly in WALInsertLock has the
 463  * advantage of not needing any additional locks to update the value.
 464  */
 465 typedef struct	/* one WAL insertion lock plus its progress indicators */
 466 {
 467         LWLock          lock;	/* the lightweight lock itself */
 468         XLogRecPtr      insertingAt;	/* how far the holder's insertion has progressed */
 469         XLogRecPtr      lastImportantAt;	/* LSN of last important record inserted using this lock */
 470 } WALInsertLock;
 471
 472 /*
 473  * All the WAL insertion locks are allocated as an array in shared memory. We
 474  * force the array stride to be a power of 2, which saves a few cycles in
 475  * indexing, but more importantly also ensures that individual slots don't
 476  * cross cache line boundaries. (Of course, we have to also ensure that the
 477  * array start address is suitably aligned.)
 478  */
 479 typedef union WALInsertLockPadded	/* array element type for the WAL insertion locks */
 480 {
 481         WALInsertLock l;	/* the actual lock data */
 482         char            pad[PG_CACHE_LINE_SIZE];	/* makes the union at least PG_CACHE_LINE_SIZE bytes */
 483 } WALInsertLockPadded;
 484
 485 /*
 486  * State of an exclusive backup, necessary to control concurrent activities
 487  * across sessions when working on exclusive backups.
 488  *
 489  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 490  * running, to be more precise pg_start_backup() is not being executed for
 491  * an exclusive backup and there is no exclusive backup in progress.
 492  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 493  * exclusive backup.
 494  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 495  * running and an exclusive backup is in progress. pg_stop_backup() is
 496  * needed to finish it.
 497  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 498  * exclusive backup.
 499  */
 500 typedef enum ExclusiveBackupState
 501 {
 502         EXCLUSIVE_BACKUP_NONE = 0,	/* no exclusive backup running or being started */
 503         EXCLUSIVE_BACKUP_STARTING,	/* pg_start_backup() is starting one */
 504         EXCLUSIVE_BACKUP_IN_PROGRESS,	/* started; pg_stop_backup() is needed to finish it */
 505         EXCLUSIVE_BACKUP_STOPPING	/* pg_stop_backup() is stopping it */
 506 } ExclusiveBackupState;
 507
 508 /*
 509  * Session status of running backup, used for sanity checks in SQL-callable
 510  * functions to start and stop backups.
 511  */
 512 static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
 513
 514 /*
 515  * Shared state data for WAL insertion.
 516  */
 517 typedef struct XLogCtlInsert
 518 {
 519         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 520
 521         /*
 522          * CurrBytePos is the end of reserved WAL. The next record will be
 523          * inserted at that position. PrevBytePos is the start position of the
 524          * previously inserted (or rather, reserved) record - it is copied to the
 525          * prev-link of the next record. These are stored as "usable byte
 526          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
 527          */
 528         uint64          CurrBytePos;
 529         uint64          PrevBytePos;
 530
 531         /*
 532          * Make sure the above heavily-contended spinlock and byte positions are
 533          * on their own cache line. In particular, the RedoRecPtr and full page
 534          * write variables below should be on a different cache line. They are
 535          * read on every WAL insertion, but updated rarely, and we don't want
 536          * those reads to steal the cache line containing Curr/PrevBytePos.
 537          */
 538         char            pad[PG_CACHE_LINE_SIZE];	/* never accessed; exists only for the separation described above */
 539
 540         /*
 541          * fullPageWrites is the master copy used by all backends to determine
 542          * whether to write full-page to WAL, instead of using process-local one.
 543          * This is required because, when full_page_writes is changed by SIGHUP,
 544          * we must WAL-log it before it actually affects WAL-logging by backends.
 545          * Checkpointer sets at startup or after SIGHUP.
 546          *
 547          * To read these fields, you must hold an insertion lock. To modify them,
 548          * you must hold ALL the locks.
 549          */
 550         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 551         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 552         bool            fullPageWrites;
 553
 554         /*
 555          * exclusiveBackupState indicates the state of an exclusive backup (see
 556          * comments of ExclusiveBackupState for more details). nonExclusiveBackups
 557          * is a counter indicating the number of streaming base backups currently
 558          * in progress. forcePageWrites is set to true when either of these is
 559          * non-zero. lastBackupStart is the latest checkpoint redo location used
 560          * as a starting point for an online backup.
 561          */
 562         ExclusiveBackupState exclusiveBackupState;
 563         int                     nonExclusiveBackups;
 564         XLogRecPtr      lastBackupStart;
 565
 566         /*
 567          * WAL insertion locks.
          *
          * Points to the array of padded insertion locks (see WALInsertLockPadded);
          * presumably NUM_XLOGINSERT_LOCKS entries long - confirm at the allocation
          * site.
 568          */
 569         WALInsertLockPadded *WALInsertLocks;
 570 } XLogCtlInsert;
 571
 572 /*
 573  * Total shared-memory state for XLOG.
 574  */
 575 typedef struct XLogCtlData
 576 {
 577         XLogCtlInsert Insert;
 578
 579         /* Protected by info_lck: */
 580         XLogwrtRqst LogwrtRqst;
 581         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
 582         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
 583         TransactionId ckptXid;
 584         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
 585         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
 586
 587         XLogSegNo       lastRemovedSegNo;       /* latest removed/recycled XLOG segment */
 588
 589         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
 590         XLogRecPtr      unloggedLSN;
 591         slock_t         ulsn_lck;
 592
 593         /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
 594         pg_time_t       lastSegSwitchTime;
 595         XLogRecPtr      lastSegSwitchLSN;
 596
 597         /*
 598          * Protected by info_lck and WALWriteLock (you must hold either lock to
 599          * read it, but both to update)
 600          */
 601         XLogwrtResult LogwrtResult;
 602
 603         /*
 604          * Latest initialized page in the cache (last byte position + 1).
 605          *
 606          * To change the identity of a buffer (and InitializedUpTo), you need to
 607          * hold WALBufMappingLock.  To change the identity of a buffer that's
 608          * still dirty, the old page needs to be written out first, and for that
 609          * you need WALWriteLock, and you need to ensure that there are no
 610          * in-progress insertions to the page by calling
 611          * WaitXLogInsertionsToFinish().
 612          */
 613         XLogRecPtr      InitializedUpTo;
 614
 615         /*
 616          * These values do not change after startup, although the pointed-to pages
 617          * and xlblocks values certainly do.  xlblock values are protected by
 618          * WALBufMappingLock.
 619          */
 620         char       *pages;                      /* buffers for unwritten XLOG pages */
 621         XLogRecPtr *xlblocks;           /* per buffer: ptr to 1st byte of its page + XLOG_BLCKSZ */
 622         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 623
 624         /*
 625          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
 626          * If we created a new timeline when the system was started up,
 627          * PrevTimeLineID is the old timeline's ID that we forked off from.
 628          * Otherwise it's equal to ThisTimeLineID.
 629          */
 630         TimeLineID      ThisTimeLineID;
 631         TimeLineID      PrevTimeLineID;
 632
 633         /*
 634          * archiveCleanupCommand is read from recovery.conf but needs to be in
 635          * shared memory so that the checkpointer process can access it.
 636          */
 637         char            archiveCleanupCommand[MAXPGPATH];
 638
 639         /*
 640          * SharedRecoveryInProgress indicates if we're still in crash or archive
 641          * recovery.  Protected by info_lck.
 642          */
 643         bool            SharedRecoveryInProgress;
 644
 645         /*
 646          * SharedHotStandbyActive indicates if hot standby is currently active
 647          * (presumably: read-only queries may be run).  Protected by info_lck.
 648          */
 649         bool            SharedHotStandbyActive;
 650
 651         /*
 652          * WalWriterSleeping indicates whether the WAL writer is currently in
 653          * low-power mode (and hence should be nudged if an async commit occurs).
 654          * Protected by info_lck.
 655          */
 656         bool            WalWriterSleeping;
 657
 658         /*
 659          * recoveryWakeupLatch is used to wake up the startup process to continue
 660          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 661          * to appear.
 662          */
 663         Latch           recoveryWakeupLatch;
 664
 665         /*
 666          * During recovery, we keep a copy of the latest checkpoint record here.
 667          * lastCheckPointRecPtr points to start of checkpoint record and
 668          * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
 669          * checkpointer when it wants to create a restartpoint.
 670          *
 671          * Protected by info_lck.
 672          */
 673         XLogRecPtr      lastCheckPointRecPtr;
 674         XLogRecPtr      lastCheckPointEndPtr;
 675         CheckPoint      lastCheckPoint;
 676
 677         /*
 678          * lastReplayedEndRecPtr points to end+1 of the last record successfully
 679          * replayed. When we're currently replaying a record, ie. in a redo
 680          * function, replayEndRecPtr points to the end+1 of the record being
 681          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
 682          */
 683         XLogRecPtr      lastReplayedEndRecPtr;
 684         TimeLineID      lastReplayedTLI;
 685         XLogRecPtr      replayEndRecPtr;
 686         TimeLineID      replayEndTLI;
 687         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 688         TimestampTz recoveryLastXTime;
 689
 690         /*
 691          * timestamp of when we started replaying the current chunk of WAL data,
 692          * only relevant for replication or archive recovery
 693          */
 694         TimestampTz currentChunkStartTime;
 695         /* Are we requested to pause recovery? */
 696         bool            recoveryPause;
 697
 698         /*
 699          * lastFpwDisableRecPtr points to the start of the last replayed
 700          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
 701          */
 702         XLogRecPtr      lastFpwDisableRecPtr;
 703
 704         slock_t         info_lck;               /* locks shared variables shown above */
 705 } XLogCtlData;
 706
 707 static XLogCtlData *XLogCtl = NULL;
 708
 709 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
 710 static WALInsertLockPadded *WALInsertLocks = NULL;
 711
 712 /*
 713  * We maintain an image of pg_control in shared memory.
 714  */
 715 static ControlFileData *ControlFile = NULL;
 716
 717 /*
 718  * Calculate the amount of space left on the page after 'endptr'. Beware
 719  * multiple evaluation!
 720  */
 721 #define INSERT_FREESPACE(endptr)        \
 722         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 723
 724 /* Macro to advance to next buffer index. */
 725 #define NextBufIdx(idx)         \
 726                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 727
 728 /*
 729  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 730  * would hold if it was in cache, the page containing 'recptr'.
 731  */
 732 #define XLogRecPtrToBufIdx(recptr)      \
 733         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 734
 735 /*
  736  * This is the number of bytes in a WAL page usable for WAL data.
 737  */
 738 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 739
 740 /* Convert min_wal_size_mb and max wal_size_mb to equivalent segment count */
 741 #define ConvertToXSegs(x, segsize)      \
 742         (x / ((segsize) / (1024 * 1024)))
 743
 744 /* The number of bytes in a WAL segment usable for WAL data. */
 745 static int      UsableBytesInSegment;
 746
 747 /*
 748  * Private, possibly out-of-date copy of shared LogwrtResult.
 749  * See discussion above.
 750  */
 751 static XLogwrtResult LogwrtResult = {0, 0};
 752
 753 /*
 754  * Codes indicating where we got a WAL file from during recovery, or where
 755  * to attempt to get one.
 756  */
 757 typedef enum
 758 {
 759         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 760         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 761         XLOG_FROM_PG_WAL,                       /* existing file in pg_wal */
 762         XLOG_FROM_STREAM                        /* streamed from master */
 763 } XLogSource;
 764
 765 /* human-readable names for XLogSources, for debugging output */
 766 static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
 767
 768 /*
 769  * openLogFile is -1 or a kernel FD for an open log file segment.
 770  * When it's open, openLogOff is the current seek offset in the file.
 771  * openLogSegNo identifies the segment.  These variables are only
 772  * used to write the XLOG, and so will normally refer to the active segment.
 773  */
 774 static int      openLogFile = -1;
 775 static XLogSegNo openLogSegNo = 0;
 776 static uint32 openLogOff = 0;
 777
 778 /*
 779  * These variables are used similarly to the ones above, but for reading
 780  * the XLOG.  Note, however, that readOff generally represents the offset
 781  * of the page just read, not the seek position of the FD itself, which
 782  * will be just past that page. readLen indicates how much of the current
 783  * page has been read into readBuf, and readSource indicates where we got
 784  * the currently open file from.
 785  */
 786 static int      readFile = -1;
 787 static XLogSegNo readSegNo = 0;
 788 static uint32 readOff = 0;
 789 static uint32 readLen = 0;
 790 static XLogSource readSource = 0;       /* XLOG_FROM_* code */
 791
 792 /*
 793  * Keeps track of which source we're currently reading from. This is
 794  * different from readSource in that this is always set, even when we don't
 795  * currently have a WAL file open. If lastSourceFailed is set, our last
 796  * attempt to read from currentSource failed, and we should try another source
 797  * next.
 798  */
 799 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 800 static bool lastSourceFailed = false;
 801
 802 typedef struct XLogPageReadPrivate
 803 {
 804         int                     emode;
 805         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
 806         bool            randAccess;
 807 } XLogPageReadPrivate;
 808
 809 /*
 810  * These variables track when we last obtained some WAL data to process,
 811  * and where we got it from.  (XLogReceiptSource is initially the same as
 812  * readSource, but readSource gets reset to zero when we don't have data
 813  * to process right now.  It is also different from currentSource, which
 814  * also changes when we try to read from a source and fail, while
 815  * XLogReceiptSource tracks where we last successfully read some WAL.)
 816  */
 817 static TimestampTz XLogReceiptTime = 0;
 818 static XLogSource XLogReceiptSource = 0;        /* XLOG_FROM_* code */
 819
 820 /* State information for XLOG reading */
 821 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 822 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 823
 824 /*
 825  * Local copies of equivalent fields in the control file.  When running
 826  * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
 827  * expect to replay all the WAL available, and updateMinRecoveryPoint is
 828  * switched to false to prevent any updates while replaying records.
 829  * Those values are kept consistent as long as crash recovery runs.
 830  */
 831 static XLogRecPtr minRecoveryPoint;
 832 static TimeLineID minRecoveryPointTLI;
 833 static bool updateMinRecoveryPoint = true;
 834
 835 /*
 836  * Have we reached a consistent database state? In crash recovery, we have
 837  * to replay all the WAL, so reachedConsistency is never set. During archive
 838  * recovery, the database is consistent once minRecoveryPoint is reached.
 839  */
 840 bool            reachedConsistency = false;
 841
 842 static bool InRedo = false;
 843
 844 /* Have we launched bgwriter during recovery? */
 845 static bool bgwriterLaunched = false;
 846
 847 /* For WALInsertLockAcquire/Release functions */
 848 static int      MyLockNo = 0;
 849 static bool holdingAllLocks = false;
 850
 851 #ifdef WAL_DEBUG
 852 static MemoryContext walDebugCxt = NULL;
 853 #endif
 854
 855 static void readRecoveryCommandFile(void);
 856 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
 857 static bool recoveryStopsBefore(XLogReaderState *record);
 858 static bool recoveryStopsAfter(XLogReaderState *record);
 859 static void recoveryPausesHere(void);
 860 static bool recoveryApplyDelay(XLogReaderState *record);
 861 static void SetLatestXTime(TimestampTz xtime);
 862 static void SetCurrentChunkStartTime(TimestampTz xtime);
 863 static void CheckRequiredParameterValues(void);
 864 static void XLogReportParameters(void);
 865 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 866                                         TimeLineID prevTLI);
 867 static void LocalSetXLogInsertAllowed(void);
 868 static void CreateEndOfRecoveryRecord(void);
 869 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 870 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 871 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 872
 873 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 874 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 875 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 876 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 877                                            bool find_free, XLogSegNo max_segno,
 878                                            bool use_lock);
 879 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 880                          int source, bool notfoundOk);
 881 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 882 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 883                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 884                          TimeLineID *readTLI);
 885 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 886                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 887 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 888 static void XLogFileClose(void);
 889 static void PreallocXlogFiles(XLogRecPtr endptr);
 890 static void RemoveTempXlogFiles(void);
 891 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
 892 static void RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
 893 static void UpdateLastRemovedPtr(char *filename);
 894 static void ValidateXLOGDirectoryStructure(void);
 895 static void CleanupBackupHistory(void);
 896 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 897 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 898                    int emode, bool fetching_ckpt);
 899 static void CheckRecoveryConsistency(void);
 900 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 901                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 902 static bool rescanLatestTimeLine(void);
 903 static void WriteControlFile(void);
 904 static void ReadControlFile(void);
 905 static char *str_time(pg_time_t tnow);
 906 static bool CheckForStandbyTrigger(void);
 907
 908 #ifdef WAL_DEBUG
 909 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
 910 #endif
 911 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
 912 static void pg_start_backup_callback(int code, Datum arg);
 913 static void pg_stop_backup_callback(int code, Datum arg);
 914 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 915                                   bool *backupEndRequired, bool *backupFromStandby);
 916 static bool read_tablespace_map(List **tablespaces);
 917
 918 static void rm_redo_error_callback(void *arg);
 919 static int      get_sync_bit(int method);
 920
 921 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 922                                         XLogRecData *rdata,
 923                                         XLogRecPtr StartPos, XLogRecPtr EndPos);
 924 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 925                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 926 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 927                                   XLogRecPtr *PrevPtr);
 928 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 929 static char *GetXLogBuffer(XLogRecPtr ptr);
 930 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 931 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 932 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 933 static void checkXLogConsistency(XLogReaderState *record);
 934
 935 static void WALInsertLockAcquire(void);
 936 static void WALInsertLockAcquireExclusive(void);
 937 static void WALInsertLockRelease(void);
 938 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 939
 940 /*
 941  * Insert an XLOG record represented by an already-constructed chain of data
 942  * chunks.  This is a low-level routine; to construct the WAL record header
 943  * and data, use the higher-level routines in xloginsert.c.
 944  *
 945  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 946  * WAL record applies to, that were not included in the record as full page
 947  * images.  If fpw_lsn >= RedoRecPtr, the function does not perform the
 948  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 949  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 950  * record is always inserted.
 951  *
 952  * 'flags' gives more in-depth control on the record being inserted. See
 953  * XLogSetRecordFlags() for details.
 954  *
 955  * The first XLogRecData in the chain must be for the record header, and its
 956  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 957  * xl_crc fields in the header, the rest of the header must already be filled
 958  * by the caller.
 959  *
 960  * Returns XLOG pointer to end of record (beginning of next record).
 961  * This can be used as LSN for data pages affected by the logged action.
 962  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 963  * before the data page can be written out.  This implements the basic
 964  * WAL rule "write the log before the data".)
 965  */
 966 XLogRecPtr
 967 XLogInsertRecord(XLogRecData *rdata,
 968                                  XLogRecPtr fpw_lsn,
 969                                  uint8 flags)
 970 {
 971         XLogCtlInsert *Insert = &XLogCtl->Insert;
 972         pg_crc32c       rdata_crc;
 973         bool            inserted;
 974         XLogRecord *rechdr = (XLogRecord *) rdata->data;
 975         uint8           info = rechdr->xl_info & ~XLR_INFO_MASK;
 976         bool            isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
 977                                                            info == XLOG_SWITCH);
 978         XLogRecPtr      StartPos;
 979         XLogRecPtr      EndPos;
 980
 981         /* we assume that all of the record header is in the first chunk */
 982         Assert(rdata->len >= SizeOfXLogRecord);
 983
 984         /* cross-check on whether we should be here or not */
 985         if (!XLogInsertAllowed())
 986                 elog(ERROR, "cannot make new WAL entries during recovery");
 987
 988         /*----------
 989          *
 990          * We have now done all the preparatory work we can without holding a
 991          * lock or modifying shared state. From here on, inserting the new WAL
 992          * record to the shared WAL buffer cache is a two-step process:
 993          *
 994          * 1. Reserve the right amount of space from the WAL. The current head of
 995          *        reserved space is kept in Insert->CurrBytePos, and is protected by
 996          *        insertpos_lck.
 997          *
 998          * 2. Copy the record to the reserved WAL space. This involves finding the
 999          *        correct WAL buffer containing the reserved space, and copying the
1000          *        record in place. This can be done concurrently in multiple processes.
1001          *
1002          * To keep track of which insertions are still in-progress, each concurrent
1003          * inserter acquires an insertion lock. In addition to just indicating that
1004          * an insertion is in progress, the lock tells others how far the inserter
1005          * has progressed. There is a small fixed number of insertion locks,
1006          * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
1007          * boundary, it updates the value stored in the lock to the how far it has
1008          * inserted, to allow the previous buffer to be flushed.
1009          *
1010          * Holding onto an insertion lock also protects RedoRecPtr and
1011          * fullPageWrites from changing until the insertion is finished.
1012          *
1013          * Step 2 can usually be done completely in parallel. If the required WAL
1014          * page is not initialized yet, you have to grab WALBufMappingLock to
1015          * initialize it, but the WAL writer tries to do that ahead of insertions
1016          * to avoid that from happening in the critical path.
1017          *
1018          *----------
1019          */
1020         START_CRIT_SECTION();
1021         if (isLogSwitch)
1022                 WALInsertLockAcquireExclusive();
1023         else
1024                 WALInsertLockAcquire();
1025
1026         /*
1027          * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
1028          * If so, may have to go back and have the caller recompute everything.
1029          * This can only happen just after a checkpoint, so it's better to be slow
1030          * in this case and fast otherwise.
1031          *
1032          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1033          * affect the contents of the XLOG record, so we'll update our local copy
1034          * but not force a recomputation.  (If doPageWrites was just turned off,
1035          * we could recompute the record without full pages, but we choose not to
1036          * bother.)
1037          */
1038         if (RedoRecPtr != Insert->RedoRecPtr)
1039         {
1040                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1041                 RedoRecPtr = Insert->RedoRecPtr;
1042         }
1043         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
1044
1045         if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
1046         {
1047                 /*
1048                  * Oops, some buffer now needs to be backed up that the caller didn't
1049                  * back up.  Start over.
1050                  */
1051                 WALInsertLockRelease();
1052                 END_CRIT_SECTION();
1053                 return InvalidXLogRecPtr;
1054         }
1055
1056         /*
1057          * Reserve space for the record in the WAL. This also sets the xl_prev
1058          * pointer.
1059          */
1060         if (isLogSwitch)
1061                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1062         else
1063         {
1064                 ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
1065                                                                   &rechdr->xl_prev);
1066                 inserted = true;
1067         }
1068
1069         if (inserted)
1070         {
1071                 /*
1072                  * Now that xl_prev has been filled in, calculate CRC of the record
1073                  * header.
1074                  */
1075                 rdata_crc = rechdr->xl_crc;
1076                 COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
1077                 FIN_CRC32C(rdata_crc);
1078                 rechdr->xl_crc = rdata_crc;
1079
1080                 /*
1081                  * All the record data, including the header, is now ready to be
1082                  * inserted. Copy the record in the space reserved.
1083                  */
1084                 CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
1085                                                         StartPos, EndPos);
1086
1087                 /*
1088                  * Unless record is flagged as not important, update LSN of last
1089                  * important record in the current slot. When holding all locks, just
1090                  * update the first one.
1091                  */
1092                 if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
1093                 {
1094                         int                     lockno = holdingAllLocks ? 0 : MyLockNo;
1095
1096                         WALInsertLocks[lockno].l.lastImportantAt = StartPos;
1097                 }
1098         }
1099         else
1100         {
1101                 /*
1102                  * This was an xlog-switch record, but the current insert location was
1103                  * already exactly at the beginning of a segment, so there was no need
1104                  * to do anything.
1105                  */
1106         }
1107
1108         /*
1109          * Done! Let others know that we're finished.
1110          */
1111         WALInsertLockRelease();
1112
1113         MarkCurrentTransactionIdLoggedIfAny();
1114
1115         END_CRIT_SECTION();
1116
1117         /*
1118          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1119          */
1120         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1121         {
1122                 SpinLockAcquire(&XLogCtl->info_lck);
1123                 /* advance global request to include new block(s) */
1124                 if (XLogCtl->LogwrtRqst.Write < EndPos)
1125                         XLogCtl->LogwrtRqst.Write = EndPos;
1126                 /* update local result copy while I have the chance */
1127                 LogwrtResult = XLogCtl->LogwrtResult;
1128                 SpinLockRelease(&XLogCtl->info_lck);
1129         }
1130
1131         /*
1132          * If this was an XLOG_SWITCH record, flush the record and the empty
1133          * padding space that fills the rest of the segment, and perform
1134          * end-of-segment actions (eg, notifying archiver).
1135          */
1136         if (isLogSwitch)
1137         {
1138                 TRACE_POSTGRESQL_WAL_SWITCH();
1139                 XLogFlush(EndPos);
1140
1141                 /*
1142                  * Even though we reserved the rest of the segment for us, which is
1143                  * reflected in EndPos, we return a pointer to just the end of the
1144                  * xlog-switch record.
1145                  */
1146                 if (inserted)
1147                 {
1148                         EndPos = StartPos + SizeOfXLogRecord;
1149                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1150                         {
1151                                 uint64          offset = XLogSegmentOffset(EndPos, wal_segment_size);
1152
1153                                 if (offset == EndPos % XLOG_BLCKSZ)
1154                                         EndPos += SizeOfXLogLongPHD;
1155                                 else
1156                                         EndPos += SizeOfXLogShortPHD;
1157                         }
1158                 }
1159         }
1160
1161 #ifdef WAL_DEBUG
1162         if (XLOG_DEBUG)
1163         {
1164                 static XLogReaderState *debug_reader = NULL;
1165                 StringInfoData buf;
1166                 StringInfoData recordBuf;
1167                 char       *errormsg = NULL;
1168                 MemoryContext oldCxt;
1169
1170                 oldCxt = MemoryContextSwitchTo(walDebugCxt);
1171
1172                 initStringInfo(&buf);
1173                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1174                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1175
1176                 /*
1177                  * We have to piece together the WAL record data from the XLogRecData
1178                  * entries, so that we can pass it to the rm_desc function as one
1179                  * contiguous chunk.
1180                  */
1181                 initStringInfo(&recordBuf);
1182                 for (; rdata != NULL; rdata = rdata->next)
1183                         appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1184
1185                 if (!debug_reader)
1186                         debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);
1187
1188                 if (!debug_reader)
1189                 {
1190                         appendStringInfoString(&buf, "error decoding record: out of memory");
1191                 }
1192                 else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1193                                                                    &errormsg))
1194                 {
1195                         appendStringInfo(&buf, "error decoding record: %s",
1196                                                          errormsg ? errormsg : "no error message");
1197                 }
1198                 else
1199                 {
1200                         appendStringInfoString(&buf, " - ");
1201                         xlog_outdesc(&buf, debug_reader);
1202                 }
1203                 elog(LOG, "%s", buf.data);
1204
1205                 pfree(buf.data);
1206                 pfree(recordBuf.data);
1207                 MemoryContextSwitchTo(oldCxt);
1208         }
1209 #endif
1210
1211         /*
1212          * Update our global variables
1213          */
1214         ProcLastRecPtr = StartPos;
1215         XactLastRecEnd = EndPos;
1216
1217         return EndPos;
1218 }
1219
1220 /*
1221  * Reserves the right amount of space for a record of given size from the WAL.
1222  * *StartPos is set to the beginning of the reserved section, *EndPos to
1223  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1224  * used to set the xl_prev of this record.
1225  *
1226  * This is the performance critical part of XLogInsert that must be serialized
1227  * across backends. The rest can happen mostly in parallel. Try to keep this
1228  * section as short as possible, insertpos_lck can be heavily contended on a
1229  * busy system.
1230  *
1231  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1232  * where we actually copy the record to the reserved space.
1233  */
1234 static void
1235 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1236                                                   XLogRecPtr *PrevPtr)
1237 {
1238         XLogCtlInsert *Insert = &XLogCtl->Insert;
1239         uint64          startbytepos;
1240         uint64          endbytepos;
1241         uint64          prevbytepos;
1242
1243         size = MAXALIGN(size);
1244
1245         /* All (non xlog-switch) records should contain data. */
1246         Assert(size > SizeOfXLogRecord);
1247
1248         /*
1249          * The duration the spinlock needs to be held is minimized by minimizing
1250          * the calculations that have to be done while holding the lock. The
1251          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1252          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1253          * page headers. The mapping between "usable" byte positions and physical
1254          * positions (XLogRecPtrs) can be done outside the locked region, and
1255          * because the usable byte position doesn't include any headers, reserving
1256          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1257          */
1258         SpinLockAcquire(&Insert->insertpos_lck);
1259
1260         startbytepos = Insert->CurrBytePos;
1261         endbytepos = startbytepos + size;
1262         prevbytepos = Insert->PrevBytePos;
1263         Insert->CurrBytePos = endbytepos;
1264         Insert->PrevBytePos = startbytepos;
1265
1266         SpinLockRelease(&Insert->insertpos_lck);
1267
1268         *StartPos = XLogBytePosToRecPtr(startbytepos);
1269         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1270         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1271
1272         /*
1273          * Check that the conversions between "usable byte positions" and
1274          * XLogRecPtrs work consistently in both directions.
1275          */
1276         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1277         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1278         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1279 }
1280
1281 /*
1282  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1283  *
1284  * A log-switch record is handled slightly differently. The rest of the
1285  * segment will be reserved for this insertion, as indicated by the returned
1286  * *EndPos value. However, if we are already at the beginning of the current
1287  * segment, *StartPos and *EndPos are set to the current location without
1288  * reserving any space, and the function returns false.
1289 */
1290 static bool
1291 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1292 {
1293         XLogCtlInsert *Insert = &XLogCtl->Insert;
1294         uint64          startbytepos;
1295         uint64          endbytepos;
1296         uint64          prevbytepos;
1297         uint32          size = MAXALIGN(SizeOfXLogRecord);
1298         XLogRecPtr      ptr;
1299         uint32          segleft;
1300
1301         /*
1302          * These calculations are a bit heavy-weight to be done while holding a
1303          * spinlock, but since we're holding all the WAL insertion locks, there
1304          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1305          * compete for it, but that's not called very frequently.
1306          */
1307         SpinLockAcquire(&Insert->insertpos_lck);
1308
1309         startbytepos = Insert->CurrBytePos;
1310
1311         ptr = XLogBytePosToEndRecPtr(startbytepos);
1312         if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1313         {
1314                 SpinLockRelease(&Insert->insertpos_lck);
1315                 *EndPos = *StartPos = ptr;
1316                 return false;
1317         }
1318
1319         endbytepos = startbytepos + size;
1320         prevbytepos = Insert->PrevBytePos;
1321
1322         *StartPos = XLogBytePosToRecPtr(startbytepos);
1323         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1324
1325         segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1326         if (segleft != wal_segment_size)
1327         {
1328                 /* consume the rest of the segment */
1329                 *EndPos += segleft;
1330                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1331         }
1332         Insert->CurrBytePos = endbytepos;
1333         Insert->PrevBytePos = startbytepos;
1334
1335         SpinLockRelease(&Insert->insertpos_lck);
1336
1337         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1338
1339         Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1340         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1341         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1342         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1343
1344         return true;
1345 }
1346
1347 /*
1348  * Checks whether the current buffer page and backup page stored in the
1349  * WAL record are consistent or not. Before comparing the two pages, a
1350  * masking can be applied to the pages to ignore certain areas like hint bits,
1351  * unused space between pd_lower and pd_upper among other things. This
1352  * function should be called once WAL replay has been completed for a
1353  * given record.
1354  */
1355 static void
1356 checkXLogConsistency(XLogReaderState *record)
1357 {
1358         RmgrId          rmid = XLogRecGetRmid(record);
1359         RelFileNode rnode;
1360         ForkNumber      forknum;
1361         BlockNumber blkno;
1362         int                     block_id;
1363
1364         /* Records with no backup blocks have no need for consistency checks. */
1365         if (!XLogRecHasAnyBlockRefs(record))
1366                 return;
1367
1368         Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1369
1370         for (block_id = 0; block_id <= record->max_block_id; block_id++)
1371         {
1372                 Buffer          buf;
1373                 Page            page;
1374
1375                 if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1376                 {
1377                         /*
1378                          * WAL record doesn't contain a block reference with the given id.
1379                          * Do nothing.
1380                          */
1381                         continue;
1382                 }
1383
1384                 Assert(XLogRecHasBlockImage(record, block_id));
1385
1386                 if (XLogRecBlockImageApply(record, block_id))
1387                 {
1388                         /*
1389                          * WAL record has already applied the page, so bypass the
1390                          * consistency check as that would result in comparing the full
1391                          * page stored in the record with itself.
1392                          */
1393                         continue;
1394                 }
1395
1396                 /*
1397                  * Read the contents from the current buffer and store it in a
1398                  * temporary page.
1399                  */
1400                 buf = XLogReadBufferExtended(rnode, forknum, blkno,
1401                                                                          RBM_NORMAL_NO_LOG);
1402                 if (!BufferIsValid(buf))
1403                         continue;
1404
1405                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1406                 page = BufferGetPage(buf);
1407
1408                 /*
1409                  * Take a copy of the local page where WAL has been applied to have a
1410                  * comparison base before masking it...
1411                  */
1412                 memcpy(replay_image_masked, page, BLCKSZ);
1413
1414                 /* No need for this page anymore now that a copy is in. */
1415                 UnlockReleaseBuffer(buf);
1416
1417                 /*
1418                  * If the block LSN is already ahead of this WAL record, we can't
1419                  * expect contents to match.  This can happen if recovery is
1420                  * restarted.
1421                  */
1422                 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1423                         continue;
1424
1425                 /*
1426                  * Read the contents from the backup copy, stored in WAL record and
1427                  * store it in a temporary page. There is no need to allocate a new
1428                  * page here, a local buffer is fine to hold its contents and a mask
1429                  * can be directly applied on it.
1430                  */
1431                 if (!RestoreBlockImage(record, block_id, master_image_masked))
1432                         elog(ERROR, "failed to restore block image");
1433
1434                 /*
1435                  * If masking function is defined, mask both the master and replay
1436                  * images
1437                  */
1438                 if (RmgrTable[rmid].rm_mask != NULL)
1439                 {
1440                         RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1441                         RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1442                 }
1443
1444                 /* Time to compare the master and replay images. */
1445                 if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
1446                 {
1447                         elog(FATAL,
1448                                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1449                                  rnode.spcNode, rnode.dbNode, rnode.relNode,
1450                                  forknum, blkno);
1451                 }
1452         }
1453 }
1454
1455 /*
1456  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1457  * area in the WAL.
1458  */
1459 static void
1460 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1461                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1462 {
1463         char       *currpos;
1464         int                     freespace;
1465         int                     written;
1466         XLogRecPtr      CurrPos;
1467         XLogPageHeader pagehdr;
1468
1469         /*
1470          * Get a pointer to the right place in the right WAL buffer to start
1471          * inserting to.
1472          */
1473         CurrPos = StartPos;
1474         currpos = GetXLogBuffer(CurrPos);
1475         freespace = INSERT_FREESPACE(CurrPos);
1476
1477         /*
1478          * there should be enough space for at least the first field (xl_tot_len)
1479          * on this page.
1480          */
1481         Assert(freespace >= sizeof(uint32));
1482
1483         /* Copy record data */
1484         written = 0;
1485         while (rdata != NULL)
1486         {
1487                 char       *rdata_data = rdata->data;
1488                 int                     rdata_len = rdata->len;
1489
1490                 while (rdata_len > freespace)
1491                 {
1492                         /*
1493                          * Write what fits on this page, and continue on the next page.
1494                          */
1495                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1496                         memcpy(currpos, rdata_data, freespace);
1497                         rdata_data += freespace;
1498                         rdata_len -= freespace;
1499                         written += freespace;
1500                         CurrPos += freespace;
1501
1502                         /*
1503                          * Get pointer to beginning of next page, and set the xlp_rem_len
1504                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1505                          *
1506                          * It's safe to set the contrecord flag and xlp_rem_len without a
1507                          * lock on the page. All the other flags were already set when the
1508                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1509                          * only backend that needs to set the contrecord flag.
1510                          */
1511                         currpos = GetXLogBuffer(CurrPos);
1512                         pagehdr = (XLogPageHeader) currpos;
1513                         pagehdr->xlp_rem_len = write_len - written;
1514                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1515
1516                         /* skip over the page header */
1517                         if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1518                         {
1519                                 CurrPos += SizeOfXLogLongPHD;
1520                                 currpos += SizeOfXLogLongPHD;
1521                         }
1522                         else
1523                         {
1524                                 CurrPos += SizeOfXLogShortPHD;
1525                                 currpos += SizeOfXLogShortPHD;
1526                         }
1527                         freespace = INSERT_FREESPACE(CurrPos);
1528                 }
1529
1530                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1531                 memcpy(currpos, rdata_data, rdata_len);
1532                 currpos += rdata_len;
1533                 CurrPos += rdata_len;
1534                 freespace -= rdata_len;
1535                 written += rdata_len;
1536
1537                 rdata = rdata->next;
1538         }
1539         Assert(written == write_len);
1540
1541         /*
1542          * If this was an xlog-switch, it's not enough to write the switch record,
1543          * we also have to consume all the remaining space in the WAL segment.  We
1544          * have already reserved that space, but we need to actually fill it.
1545          */
1546         if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1547         {
1548                 /* An xlog-switch record doesn't contain any data besides the header */
1549                 Assert(write_len == SizeOfXLogRecord);
1550
1551                 /* Assert that we did reserve the right amount of space */
1552                 Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1553
1554                 /* Use up all the remaining space on the current page */
1555                 CurrPos += freespace;
1556
1557                 /*
1558                  * Cause all remaining pages in the segment to be flushed, leaving the
1559                  * XLog position where it should be, at the start of the next segment.
1560                  * We do this one page at a time, to make sure we don't deadlock
1561                  * against ourselves if wal_buffers < wal_segment_size.
1562                  */
1563                 while (CurrPos < EndPos)
1564                 {
1565                         /*
1566                          * The minimal action to flush the page would be to call
1567                          * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1568                          * AdvanceXLInsertBuffer(...).  The page would be left initialized
1569                          * mostly to zeros, except for the page header (always the short
1570                          * variant, as this is never a segment's first page).
1571                          *
1572                          * The large vistas of zeros are good for compressibility, but the
1573                          * headers interrupting them every XLOG_BLCKSZ (with values that
1574                          * differ from page to page) are not.  The effect varies with
1575                          * compression tool, but bzip2 for instance compresses about an
1576                          * order of magnitude worse if those headers are left in place.
1577                          *
1578                          * Rather than complicating AdvanceXLInsertBuffer itself (which is
1579                          * called in heavily-loaded circumstances as well as this lightly-
1580                          * loaded one) with variant behavior, we just use GetXLogBuffer
1581                          * (which itself calls the two methods we need) to get the pointer
1582                          * and zero most of the page.  Then we just zero the page header.
1583                          */
1584                         currpos = GetXLogBuffer(CurrPos);
1585                         MemSet(currpos, 0, SizeOfXLogShortPHD);
1586
1587                         CurrPos += XLOG_BLCKSZ;
1588                 }
1589         }
1590         else
1591         {
1592                 /* Align the end position, so that the next record starts aligned */
1593                 CurrPos = MAXALIGN64(CurrPos);
1594         }
1595
1596         if (CurrPos != EndPos)
1597                 elog(PANIC, "space reserved for WAL record does not match what was written");
1598 }
1599
1600 /*
1601  * Acquire a WAL insertion lock, for inserting to WAL.
1602  */
1603 static void
1604 WALInsertLockAcquire(void)
1605 {
1606         bool            immed;
1607
1608         /*
1609          * It doesn't matter which of the WAL insertion locks we acquire, so try
1610          * the one we used last time.  If the system isn't particularly busy, it's
1611          * a good bet that it's still available, and it's good to have some
1612          * affinity to a particular lock so that you don't unnecessarily bounce
1613          * cache lines between processes when there's no contention.
1614          *
1615          * If this is the first time through in this backend, pick a lock
1616          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1617          * lot of very short connections.
1618          */
1619         static int      lockToTry = -1;
1620
1621         if (lockToTry == -1)
1622                 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1623         MyLockNo = lockToTry;
1624
1625         /*
1626          * The insertingAt value is initially set to 0, as we don't know our
1627          * insert location yet.
1628          */
1629         immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1630         if (!immed)
1631         {
1632                 /*
1633                  * If we couldn't get the lock immediately, try another lock next
1634                  * time.  On a system with more insertion locks than concurrent
1635                  * inserters, this causes all the inserters to eventually migrate to a
1636                  * lock that no-one else is using.  On a system with more inserters
1637                  * than locks, it still helps to distribute the inserters evenly
1638                  * across the locks.
1639                  */
1640                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1641         }
1642 }
1643
1644 /*
1645  * Acquire all WAL insertion locks, to prevent other backends from inserting
1646  * to WAL.
1647  */
1648 static void
1649 WALInsertLockAcquireExclusive(void)
1650 {
1651         int                     i;
1652
1653         /*
1654          * When holding all the locks, all but the last lock's insertingAt
1655          * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1656          * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1657          */
1658         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1659         {
1660                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1661                 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1662                                                 &WALInsertLocks[i].l.insertingAt,
1663                                                 PG_UINT64_MAX);
1664         }
1665         /* Variable value reset to 0 at release */
1666         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1667
1668         holdingAllLocks = true;
1669 }
1670
1671 /*
1672  * Release our insertion lock (or locks, if we're holding them all).
1673  *
1674  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1675  * next time the lock is acquired.
1676  */
1677 static void
1678 WALInsertLockRelease(void)
1679 {
1680         if (holdingAllLocks)
1681         {
1682                 int                     i;
1683
1684                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1685                         LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1686                                                                   &WALInsertLocks[i].l.insertingAt,
1687                                                                   0);
1688
1689                 holdingAllLocks = false;
1690         }
1691         else
1692         {
1693                 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1694                                                           &WALInsertLocks[MyLockNo].l.insertingAt,
1695                                                           0);
1696         }
1697 }
1698
1699 /*
1700  * Update our insertingAt value, to let others know that we've finished
1701  * inserting up to that point.
1702  */
1703 static void
1704 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1705 {
1706         if (holdingAllLocks)
1707         {
1708                 /*
1709                  * We use the last lock to mark our actual position, see comments in
1710                  * WALInsertLockAcquireExclusive.
1711                  */
1712                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1713                                                 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1714                                                 insertingAt);
1715         }
1716         else
1717                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1718                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1719                                                 insertingAt);
1720 }
1721
1722 /*
1723  * Wait for any WAL insertions < upto to finish.
1724  *
1725  * Returns the location of the oldest insertion that is still in-progress.
1726  * Any WAL prior to that point has been fully copied into WAL buffers, and
1727  * can be flushed out to disk. Because this waits for any insertions older
1728  * than 'upto' to finish, the return value is always >= 'upto'.
1729  *
1730  * Note: When you are about to write out WAL, you must call this function
1731  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1732  * need to wait for an insertion to finish (or at least advance to next
1733  * uninitialized page), and the inserter might need to evict an old WAL buffer
1734  * to make room for a new one, which in turn requires WALWriteLock.
1735  */
1736 static XLogRecPtr
1737 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1738 {
1739         uint64          bytepos;
1740         XLogRecPtr      reservedUpto;
1741         XLogRecPtr      finishedUpto;
1742         XLogCtlInsert *Insert = &XLogCtl->Insert;
1743         int                     i;
1744
1745         if (MyProc == NULL)
1746                 elog(PANIC, "cannot wait without a PGPROC structure");
1747
1748         /* Read the current insert position */
1749         SpinLockAcquire(&Insert->insertpos_lck);
1750         bytepos = Insert->CurrBytePos;
1751         SpinLockRelease(&Insert->insertpos_lck);
1752         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1753
1754         /*
1755          * No-one should request to flush a piece of WAL that hasn't even been
1756          * reserved yet. However, it can happen if there is a block with a bogus
1757          * LSN on disk, for example. XLogFlush checks for that situation and
1758          * complains, but only after the flush. Here we just assume that to mean
1759          * that all WAL that has been reserved needs to be finished. In this
1760          * corner-case, the return value can be smaller than 'upto' argument.
1761          */
1762         if (upto > reservedUpto)
1763         {
1764                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1765                          (uint32) (upto >> 32), (uint32) upto,
1766                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1767                 upto = reservedUpto;
1768         }
1769
1770         /*
1771          * Loop through all the locks, sleeping on any in-progress insert older
1772          * than 'upto'.
1773          *
1774          * finishedUpto is our return value, indicating the point upto which all
1775          * the WAL insertions have been finished. Initialize it to the head of
1776          * reserved WAL, and as we iterate through the insertion locks, back it
1777          * out for any insertion that's still in progress.
1778          */
1779         finishedUpto = reservedUpto;
1780         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1781         {
1782                 XLogRecPtr      insertingat = InvalidXLogRecPtr;
1783
1784                 do
1785                 {
1786                         /*
1787                          * See if this insertion is in progress. LWLockWait will wait for
1788                          * the lock to be released, or for the 'value' to be set by a
1789                          * LWLockUpdateVar call.  When a lock is initially acquired, its
1790                          * value is 0 (InvalidXLogRecPtr), which means that we don't know
1791                          * where it's inserting yet.  We will have to wait for it.  If
1792                          * it's a small insertion, the record will most likely fit on the
1793                          * same page and the inserter will release the lock without ever
1794                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1795                          * advertise the insertion point with LWLockUpdateVar before
1796                          * sleeping.
1797                          */
1798                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1799                                                                  &WALInsertLocks[i].l.insertingAt,
1800                                                                  insertingat, &insertingat))
1801                         {
1802                                 /* the lock was free, so no insertion in progress */
1803                                 insertingat = InvalidXLogRecPtr;
1804                                 break;
1805                         }
1806
1807                         /*
1808                          * This insertion is still in progress. Have to wait, unless the
1809                          * inserter has proceeded past 'upto'.
1810                          */
1811                 } while (insertingat < upto);
1812
1813                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1814                         finishedUpto = insertingat;
1815         }
1816         return finishedUpto;
1817 }
1818
1819 /*
1820  * Get a pointer to the right location in the WAL buffer containing the
1821  * given XLogRecPtr.
1822  *
1823  * If the page is not initialized yet, it is initialized. That might require
1824  * evicting an old dirty buffer from the buffer cache, which means I/O.
1825  *
1826  * The caller must ensure that the page containing the requested location
1827  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1828  * hold onto a WAL insertion lock with the insertingAt position set to
1829  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1830  * to evict an old page from the buffer. (This means that once you call
1831  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1832  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1833  * later, because older buffers might be recycled already)
1834  */
1835 static char *
1836 GetXLogBuffer(XLogRecPtr ptr)
1837 {
1838         int                     idx;
1839         XLogRecPtr      endptr;
1840         static uint64 cachedPage = 0;
1841         static char *cachedPos = NULL;
1842         XLogRecPtr      expectedEndPtr;
1843
1844         /*
1845          * Fast path for the common case that we need to access again the same
1846          * page as last time.
1847          */
1848         if (ptr / XLOG_BLCKSZ == cachedPage)
1849         {
1850                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1851                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1852                 return cachedPos + ptr % XLOG_BLCKSZ;
1853         }
1854
1855         /*
1856          * The XLog buffer cache is organized so that a page is always loaded to a
1857          * particular buffer.  That way we can easily calculate the buffer a given
1858          * page must be loaded into, from the XLogRecPtr alone.
1859          */
1860         idx = XLogRecPtrToBufIdx(ptr);
1861
1862         /*
1863          * See what page is loaded in the buffer at the moment. It could be the
1864          * page we're looking for, or something older. It can't be anything newer
1865          * - that would imply the page we're looking for has already been written
1866          * out to disk and evicted, and the caller is responsible for making sure
1867          * that doesn't happen.
1868          *
1869          * However, we don't hold a lock while we read the value. If someone has
1870          * just initialized the page, it's possible that we get a "torn read" of
1871          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1872          * that case we will see a bogus value. That's ok, we'll grab the mapping
1873          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1874          * the page we're looking for. But it means that when we do this unlocked
1875          * read, we might see a value that appears to be ahead of the page we're
1876          * looking for. Don't PANIC on that, until we've verified the value while
1877          * holding the lock.
1878          */
1879         expectedEndPtr = ptr;
1880         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1881
1882         endptr = XLogCtl->xlblocks[idx];
1883         if (expectedEndPtr != endptr)
1884         {
1885                 XLogRecPtr      initializedUpto;
1886
1887                 /*
1888                  * Before calling AdvanceXLInsertBuffer(), which can block, let others
1889                  * know how far we're finished with inserting the record.
1890                  *
1891                  * NB: If 'ptr' points to just after the page header, advertise a
1892                  * position at the beginning of the page rather than 'ptr' itself. If
1893                  * there are no other insertions running, someone might try to flush
1894                  * up to our advertised location. If we advertised a position after
1895                  * the page header, someone might try to flush the page header, even
1896                  * though page might actually not be initialized yet. As the first
1897                  * inserter on the page, we are effectively responsible for making
1898                  * sure that it's initialized, before we let insertingAt to move past
1899                  * the page header.
1900                  */
1901                 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1902                         XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1903                         initializedUpto = ptr - SizeOfXLogShortPHD;
1904                 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1905                                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1906                         initializedUpto = ptr - SizeOfXLogLongPHD;
1907                 else
1908                         initializedUpto = ptr;
1909
1910                 WALInsertLockUpdateInsertingAt(initializedUpto);
1911
1912                 AdvanceXLInsertBuffer(ptr, false);
1913                 endptr = XLogCtl->xlblocks[idx];
1914
1915                 if (expectedEndPtr != endptr)
1916                         elog(PANIC, "could not find WAL buffer for %X/%X",
1917                                  (uint32) (ptr >> 32), (uint32) ptr);
1918         }
1919         else
1920         {
1921                 /*
1922                  * Make sure the initialization of the page is visible to us, and
1923                  * won't arrive later to overwrite the WAL data we write on the page.
1924                  */
1925                 pg_memory_barrier();
1926         }
1927
1928         /*
1929          * Found the buffer holding this page. Return a pointer to the right
1930          * offset within the page.
1931          */
1932         cachedPage = ptr / XLOG_BLCKSZ;
1933         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1934
1935         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1936         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1937
1938         return cachedPos + ptr % XLOG_BLCKSZ;
1939 }
1940
1941 /*
1942  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1943  * is the position starting from the beginning of WAL, excluding all WAL
1944  * page headers.
1945  */
1946 static XLogRecPtr
1947 XLogBytePosToRecPtr(uint64 bytepos)
1948 {
1949         uint64          fullsegs;
1950         uint64          fullpages;
1951         uint64          bytesleft;
1952         uint32          seg_offset;
1953         XLogRecPtr      result;
1954
1955         fullsegs = bytepos / UsableBytesInSegment;
1956         bytesleft = bytepos % UsableBytesInSegment;
1957
1958         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1959         {
1960                 /* fits on first page of segment */
1961                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1962         }
1963         else
1964         {
1965                 /* account for the first page on segment with long header */
1966                 seg_offset = XLOG_BLCKSZ;
1967                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1968
1969                 fullpages = bytesleft / UsableBytesInPage;
1970                 bytesleft = bytesleft % UsableBytesInPage;
1971
1972                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1973         }
1974
1975         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1976
1977         return result;
1978 }
1979
1980 /*
1981  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1982  * returns a pointer to the beginning of the page (ie. before page header),
1983  * not to where the first xlog record on that page would go to. This is used
1984  * when converting a pointer to the end of a record.
1985  */
1986 static XLogRecPtr
1987 XLogBytePosToEndRecPtr(uint64 bytepos)
1988 {
1989         uint64          fullsegs;
1990         uint64          fullpages;
1991         uint64          bytesleft;
1992         uint32          seg_offset;
1993         XLogRecPtr      result;
1994
1995         fullsegs = bytepos / UsableBytesInSegment;
1996         bytesleft = bytepos % UsableBytesInSegment;
1997
1998         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1999         {
2000                 /* fits on first page of segment */
2001                 if (bytesleft == 0)
2002                         seg_offset = 0;
2003                 else
2004                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2005         }
2006         else
2007         {
2008                 /* account for the first page on segment with long header */
2009                 seg_offset = XLOG_BLCKSZ;
2010                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2011
2012                 fullpages = bytesleft / UsableBytesInPage;
2013                 bytesleft = bytesleft % UsableBytesInPage;
2014
2015                 if (bytesleft == 0)
2016                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2017                 else
2018                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2019         }
2020
2021         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2022
2023         return result;
2024 }
2025
2026 /*
2027  * Convert an XLogRecPtr to a "usable byte position".
2028  */
2029 static uint64
2030 XLogRecPtrToBytePos(XLogRecPtr ptr)
2031 {
2032         uint64          fullsegs;
2033         uint32          fullpages;
2034         uint32          offset;
2035         uint64          result;
2036
2037         XLByteToSeg(ptr, fullsegs, wal_segment_size);
2038
2039         fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2040         offset = ptr % XLOG_BLCKSZ;
2041
2042         if (fullpages == 0)
2043         {
2044                 result = fullsegs * UsableBytesInSegment;
2045                 if (offset > 0)
2046                 {
2047                         Assert(offset >= SizeOfXLogLongPHD);
2048                         result += offset - SizeOfXLogLongPHD;
2049                 }
2050         }
2051         else
2052         {
2053                 result = fullsegs * UsableBytesInSegment +
2054                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2055                         (fullpages - 1) * UsableBytesInPage;    /* full pages */
2056                 if (offset > 0)
2057                 {
2058                         Assert(offset >= SizeOfXLogShortPHD);
2059                         result += offset - SizeOfXLogShortPHD;
2060                 }
2061         }
2062
2063         return result;
2064 }
2065
2066 /*
2067  * Initialize XLOG buffers, writing out old buffers if they still contain
2068  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2069  * true, initialize as many pages as we can without having to write out
2070  * unwritten data. Any new pages are initialized to zeros, with pages headers
2071  * initialized properly.
2072  */
2073 static void
2074 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2075 {
2076         XLogCtlInsert *Insert = &XLogCtl->Insert;
2077         int                     nextidx;
2078         XLogRecPtr      OldPageRqstPtr;
2079         XLogwrtRqst WriteRqst;
2080         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2081         XLogRecPtr      NewPageBeginPtr;
2082         XLogPageHeader NewPage;
2083         int                     npages = 0;
2084
2085         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2086
2087         /*
2088          * Now that we have the lock, check if someone initialized the page
2089          * already.
2090          */
2091         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2092         {
2093                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2094
2095                 /*
2096                  * Get ending-offset of the buffer page we need to replace (this may
2097                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2098                  * already written out.
2099                  */
2100                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2101                 if (LogwrtResult.Write < OldPageRqstPtr)
2102                 {
2103                         /*
2104                          * Nope, got work to do. If we just want to pre-initialize as much
2105                          * as we can without flushing, give up now.
2106                          */
2107                         if (opportunistic)
2108                                 break;
2109
2110                         /* Before waiting, get info_lck and update LogwrtResult */
2111                         SpinLockAcquire(&XLogCtl->info_lck);
2112                         if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2113                                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2114                         LogwrtResult = XLogCtl->LogwrtResult;
2115                         SpinLockRelease(&XLogCtl->info_lck);
2116
2117                         /*
2118                          * Now that we have an up-to-date LogwrtResult value, see if we
2119                          * still need to write it or if someone else already did.
2120                          */
2121                         if (LogwrtResult.Write < OldPageRqstPtr)
2122                         {
2123                                 /*
2124                                  * Must acquire write lock. Release WALBufMappingLock first,
2125                                  * to make sure that all insertions that we need to wait for
2126                                  * can finish (up to this same position). Otherwise we risk
2127                                  * deadlock.
2128                                  */
2129                                 LWLockRelease(WALBufMappingLock);
2130
2131                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2132
2133                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2134
2135                                 LogwrtResult = XLogCtl->LogwrtResult;
2136                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2137                                 {
2138                                         /* OK, someone wrote it already */
2139                                         LWLockRelease(WALWriteLock);
2140                                 }
2141                                 else
2142                                 {
2143                                         /* Have to write it ourselves */
2144                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2145                                         WriteRqst.Write = OldPageRqstPtr;
2146                                         WriteRqst.Flush = 0;
2147                                         XLogWrite(WriteRqst, false);
2148                                         LWLockRelease(WALWriteLock);
2149                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2150                                 }
2151                                 /* Re-acquire WALBufMappingLock and retry */
2152                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2153                                 continue;
2154                         }
2155                 }
2156
2157                 /*
2158                  * Now the next buffer slot is free and we can set it up to be the
2159                  * next output page.
2160                  */
2161                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2162                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2163
2164                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2165
2166                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2167
2168                 /*
2169                  * Be sure to re-zero the buffer so that bytes beyond what we've
2170                  * written will look like zeroes and not valid XLOG records...
2171                  */
2172                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2173
2174                 /*
2175                  * Fill the new page's header
2176                  */
2177                 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2178
2179                 /* NewPage->xlp_info = 0; */    /* done by memset */
2180                 NewPage->xlp_tli = ThisTimeLineID;
2181                 NewPage->xlp_pageaddr = NewPageBeginPtr;
2182
2183                 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2184
2185                 /*
2186                  * If online backup is not in progress, mark the header to indicate
2187                  * that WAL records beginning in this page have removable backup
2188                  * blocks.  This allows the WAL archiver to know whether it is safe to
2189                  * compress archived WAL data by transforming full-block records into
2190                  * the non-full-block format.  It is sufficient to record this at the
2191                  * page level because we force a page switch (in fact a segment
2192                  * switch) when starting a backup, so the flag will be off before any
2193                  * records can be written during the backup.  At the end of a backup,
2194                  * the last page will be marked as all unsafe when perhaps only part
2195                  * is unsafe, but at worst the archiver would miss the opportunity to
2196                  * compress a few records.
2197                  */
2198                 if (!Insert->forcePageWrites)
2199                         NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2200
2201                 /*
2202                  * If first page of an XLOG segment file, make it a long header.
2203                  */
2204                 if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2205                 {
2206                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2207
2208                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2209                         NewLongPage->xlp_seg_size = wal_segment_size;
2210                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2211                         NewPage->xlp_info |= XLP_LONG_HEADER;
2212                 }
2213
2214                 /*
2215                  * Make sure the initialization of the page becomes visible to others
2216                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2217                  * holding a lock.
2218                  */
2219                 pg_write_barrier();
2220
2221                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2222
2223                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2224
2225                 npages++;
2226         }
2227         LWLockRelease(WALBufMappingLock);
2228
2229 #ifdef WAL_DEBUG
2230         if (XLOG_DEBUG && npages > 0)
2231         {
2232                 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2233                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2234         }
2235 #endif
2236 }
2237
2238 /*
2239  * Calculate CheckPointSegments based on max_wal_size_mb and
2240  * checkpoint_completion_target.
2241  */
2242 static void
2243 CalculateCheckpointSegments(void)
2244 {
2245         double          target;
2246
2247         /*-------
2248          * Calculate the distance at which to trigger a checkpoint, to avoid
2249          * exceeding max_wal_size_mb. This is based on two assumptions:
2250          *
2251          * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2252          *    WAL for two checkpoint cycles to allow us to recover from the
2253          *    secondary checkpoint if the first checkpoint failed, though we
2254          *    only did this on the master anyway, not on standby. Keeping just
2255          *    one checkpoint simplifies processing and reduces disk space in
2256          *    many smaller databases.)
2257          * b) during checkpoint, we consume checkpoint_completion_target *
2258          *        number of segments consumed between checkpoints.
2259          *-------
2260          */
2261         target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2262                 (1.0 + CheckPointCompletionTarget);
2263
2264         /* round down */
2265         CheckPointSegments = (int) target;
2266
2267         if (CheckPointSegments < 1)
2268                 CheckPointSegments = 1;
2269 }
2270
/*
 * Store a new value into max_wal_size_mb and recompute CheckPointSegments,
 * which is derived from it.
 *
 * newval: new setting, copied as-is into max_wal_size_mb.
 * extra: not used by this function.
 *
 * NOTE(review): the name and the (newval, extra) signature suggest this is
 * the GUC assign hook for max_wal_size -- confirm against the GUC tables.
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2277
/*
 * Store a new value into CheckPointCompletionTarget and recompute
 * CheckPointSegments, which is derived from it.
 *
 * newval: new setting, copied as-is into CheckPointCompletionTarget.
 * extra: not used by this function.
 *
 * NOTE(review): the name and the (newval, extra) signature suggest this is
 * the GUC assign hook for checkpoint_completion_target -- confirm against
 * the GUC tables.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2284
2285 /*
2286  * At a checkpoint, how many WAL segments to recycle as preallocated future
2287  * XLOG segments? Returns the highest segment that should be preallocated.
2288  */
2289 static XLogSegNo
2290 XLOGfileslop(XLogRecPtr RedoRecPtr)
2291 {
2292         XLogSegNo       minSegNo;
2293         XLogSegNo       maxSegNo;
2294         double          distance;
2295         XLogSegNo       recycleSegNo;
2296
2297         /*
2298          * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2299          * correspond to. Always recycle enough segments to meet the minimum, and
2300          * remove enough segments to stay below the maximum.
2301          */
2302         minSegNo = RedoRecPtr / wal_segment_size +
2303                 ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2304         maxSegNo = RedoRecPtr / wal_segment_size +
2305                 ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2306
2307         /*
2308          * Between those limits, recycle enough segments to get us through to the
2309          * estimated end of next checkpoint.
2310          *
2311          * To estimate where the next checkpoint will finish, assume that the
2312          * system runs steadily consuming CheckPointDistanceEstimate bytes between
2313          * every checkpoint.
2314          */
2315         distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2316         /* add 10% for good measure. */
2317         distance *= 1.10;
2318
2319         recycleSegNo = (XLogSegNo) ceil(((double) RedoRecPtr + distance) /
2320                                                                         wal_segment_size);
2321
2322         if (recycleSegNo < minSegNo)
2323                 recycleSegNo = minSegNo;
2324         if (recycleSegNo > maxSegNo)
2325                 recycleSegNo = maxSegNo;
2326
2327         return recycleSegNo;
2328 }
2329
2330 /*
2331  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2332  *
2333  * new_segno indicates a log file that has just been filled up (or read
2334  * during recovery). We measure the distance from RedoRecPtr to new_segno
2335  * and see if that exceeds CheckPointSegments.
2336  *
2337  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2338  */
2339 static bool
2340 XLogCheckpointNeeded(XLogSegNo new_segno)
2341 {
2342         XLogSegNo       old_segno;
2343
2344         XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2345
2346         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2347                 return true;
2348         return false;
2349 }
2350
2351 /*
2352  * Write and/or fsync the log at least as far as WriteRqst indicates.
2353  *
2354  * If flexible == true, we don't have to write as far as WriteRqst, but
2355  * may stop at any convenient boundary (such as a cache or logfile boundary).
2356  * This option allows us to avoid uselessly issuing multiple writes when a
2357  * single one would do.
2358  *
2359  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2360  * must be called before grabbing the lock, to make sure the data is ready to
2361  * write.
2362  */
2363 static void
2364 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2365 {
2366         bool            ispartialpage;
2367         bool            last_iteration;
2368         bool            finishing_seg;
2369         bool            use_existent;
2370         int                     curridx;
2371         int                     npages;
2372         int                     startidx;
2373         uint32          startoffset;
2374
2375         /* We should always be inside a critical section here */
2376         Assert(CritSectionCount > 0);
2377
2378         /*
2379          * Update local LogwrtResult (caller probably did this already, but...)
2380          */
2381         LogwrtResult = XLogCtl->LogwrtResult;
2382
2383         /*
2384          * Since successive pages in the xlog cache are consecutively allocated,
2385          * we can usually gather multiple pages together and issue just one
2386          * write() call.  npages is the number of pages we have determined can be
2387          * written together; startidx is the cache block index of the first one,
2388          * and startoffset is the file offset at which it should go. The latter
2389          * two variables are only valid when npages > 0, but we must initialize
2390          * all of them to keep the compiler quiet.
2391          */
2392         npages = 0;
2393         startidx = 0;
2394         startoffset = 0;
2395
2396         /*
2397          * Within the loop, curridx is the cache block index of the page to
2398          * consider writing.  Begin at the buffer containing the next unwritten
2399          * page, or last partially written page.
2400          */
2401         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2402
2403         while (LogwrtResult.Write < WriteRqst.Write)
2404         {
2405                 /*
2406                  * Make sure we're not ahead of the insert process.  This could happen
2407                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2408                  * last page that's been initialized by AdvanceXLInsertBuffer.
2409                  */
2410                 XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
2411
2412                 if (LogwrtResult.Write >= EndPtr)
2413                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2414                                  (uint32) (LogwrtResult.Write >> 32),
2415                                  (uint32) LogwrtResult.Write,
2416                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2417
2418                 /* Advance LogwrtResult.Write to end of current buffer page */
2419                 LogwrtResult.Write = EndPtr;
2420                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2421
2422                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2423                                                          wal_segment_size))
2424                 {
2425                         /*
2426                          * Switch to new logfile segment.  We cannot have any pending
2427                          * pages here (since we dump what we have at segment end).
2428                          */
2429                         Assert(npages == 0);
2430                         if (openLogFile >= 0)
2431                                 XLogFileClose();
2432                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2433                                                         wal_segment_size);
2434
2435                         /* create/use new log file */
2436                         use_existent = true;
2437                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2438                         openLogOff = 0;
2439                 }
2440
2441                 /* Make sure we have the current logfile open */
2442                 if (openLogFile < 0)
2443                 {
2444                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2445                                                         wal_segment_size);
2446                         openLogFile = XLogFileOpen(openLogSegNo);
2447                         openLogOff = 0;
2448                 }
2449
2450                 /* Add current page to the set of pending pages-to-dump */
2451                 if (npages == 0)
2452                 {
2453                         /* first of group */
2454                         startidx = curridx;
2455                         startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2456                                                                                         wal_segment_size);
2457                 }
2458                 npages++;
2459
2460                 /*
2461                  * Dump the set if this will be the last loop iteration, or if we are
2462                  * at the last page of the cache area (since the next page won't be
2463                  * contiguous in memory), or if we are at the end of the logfile
2464                  * segment.
2465                  */
2466                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2467
2468                 finishing_seg = !ispartialpage &&
2469                         (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2470
2471                 if (last_iteration ||
2472                         curridx == XLogCtl->XLogCacheBlck ||
2473                         finishing_seg)
2474                 {
2475                         char       *from;
2476                         Size            nbytes;
2477                         Size            nleft;
2478                         int                     written;
2479
2480                         /* Need to seek in the file? */
2481                         if (openLogOff != startoffset)
2482                         {
2483                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2484                                         ereport(PANIC,
2485                                                         (errcode_for_file_access(),
2486                                                          errmsg("could not seek in log file %s to offset %u: %m",
2487                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2488                                                                         startoffset)));
2489                                 openLogOff = startoffset;
2490                         }
2491
2492                         /* OK to write the page(s) */
2493                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2494                         nbytes = npages * (Size) XLOG_BLCKSZ;
2495                         nleft = nbytes;
2496                         do
2497                         {
2498                                 errno = 0;
2499                                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2500                                 written = write(openLogFile, from, nleft);
2501                                 pgstat_report_wait_end();
2502                                 if (written <= 0)
2503                                 {
2504                                         if (errno == EINTR)
2505                                                 continue;
2506                                         ereport(PANIC,
2507                                                         (errcode_for_file_access(),
2508                                                          errmsg("could not write to log file %s "
2509                                                                         "at offset %u, length %zu: %m",
2510                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2511                                                                         openLogOff, nbytes)));
2512                                 }
2513                                 nleft -= written;
2514                                 from += written;
2515                         } while (nleft > 0);
2516
2517                         /* Update state for write */
2518                         openLogOff += nbytes;
2519                         npages = 0;
2520
2521                         /*
2522                          * If we just wrote the whole last page of a logfile segment,
2523                          * fsync the segment immediately.  This avoids having to go back
2524                          * and re-open prior segments when an fsync request comes along
2525                          * later. Doing it here ensures that one and only one backend will
2526                          * perform this fsync.
2527                          *
2528                          * This is also the right place to notify the Archiver that the
2529                          * segment is ready to copy to archival storage, and to update the
2530                          * timer for archive_timeout, and to signal for a checkpoint if
2531                          * too many logfile segments have been used since the last
2532                          * checkpoint.
2533                          */
2534                         if (finishing_seg)
2535                         {
2536                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2537
2538                                 /* signal that we need to wakeup walsenders later */
2539                                 WalSndWakeupRequest();
2540
2541                                 LogwrtResult.Flush = LogwrtResult.Write;        /* end of page */
2542
2543                                 if (XLogArchivingActive())
2544                                         XLogArchiveNotifySeg(openLogSegNo);
2545
2546                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2547                                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2548
2549                                 /*
2550                                  * Request a checkpoint if we've consumed too much xlog since
2551                                  * the last one.  For speed, we first check using the local
2552                                  * copy of RedoRecPtr, which might be out of date; if it looks
2553                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2554                                  * recheck.
2555                                  */
2556                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2557                                 {
2558                                         (void) GetRedoRecPtr();
2559                                         if (XLogCheckpointNeeded(openLogSegNo))
2560                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2561                                 }
2562                         }
2563                 }
2564
2565                 if (ispartialpage)
2566                 {
2567                         /* Only asked to write a partial page */
2568                         LogwrtResult.Write = WriteRqst.Write;
2569                         break;
2570                 }
2571                 curridx = NextBufIdx(curridx);
2572
2573                 /* If flexible, break out of loop as soon as we wrote something */
2574                 if (flexible && npages == 0)
2575                         break;
2576         }
2577
2578         Assert(npages == 0);
2579
2580         /*
2581          * If asked to flush, do so
2582          */
2583         if (LogwrtResult.Flush < WriteRqst.Flush &&
2584                 LogwrtResult.Flush < LogwrtResult.Write)
2585
2586         {
2587                 /*
2588                  * Could get here without iterating above loop, in which case we might
2589                  * have no open file or the wrong one.  However, we do not need to
2590                  * fsync more than one file.
2591                  */
2592                 if (sync_method != SYNC_METHOD_OPEN &&
2593                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2594                 {
2595                         if (openLogFile >= 0 &&
2596                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2597                                                                  wal_segment_size))
2598                                 XLogFileClose();
2599                         if (openLogFile < 0)
2600                         {
2601                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2602                                                                 wal_segment_size);
2603                                 openLogFile = XLogFileOpen(openLogSegNo);
2604                                 openLogOff = 0;
2605                         }
2606
2607                         issue_xlog_fsync(openLogFile, openLogSegNo);
2608                 }
2609
2610                 /* signal that we need to wakeup walsenders later */
2611                 WalSndWakeupRequest();
2612
2613                 LogwrtResult.Flush = LogwrtResult.Write;
2614         }
2615
2616         /*
2617          * Update shared-memory status
2618          *
2619          * We make sure that the shared 'request' values do not fall behind the
2620          * 'result' values.  This is not absolutely essential, but it saves some
2621          * code in a couple of places.
2622          */
2623         {
2624                 SpinLockAcquire(&XLogCtl->info_lck);
2625                 XLogCtl->LogwrtResult = LogwrtResult;
2626                 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2627                         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2628                 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2629                         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2630                 SpinLockRelease(&XLogCtl->info_lck);
2631         }
2632 }
2633
2634 /*
2635  * Record the LSN for an asynchronous transaction commit/abort
2636  * and nudge the WALWriter if there is work for it to do.
2637  * (This should not be called for synchronous commits.)
2638  */
2639 void
2640 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2641 {
2642         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2643         bool            sleeping;
2644
2645         SpinLockAcquire(&XLogCtl->info_lck);
2646         LogwrtResult = XLogCtl->LogwrtResult;
2647         sleeping = XLogCtl->WalWriterSleeping;
2648         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2649                 XLogCtl->asyncXactLSN = asyncXactLSN;
2650         SpinLockRelease(&XLogCtl->info_lck);
2651
2652         /*
2653          * If the WALWriter is sleeping, we should kick it to make it come out of
2654          * low-power mode.  Otherwise, determine whether there's a full page of
2655          * WAL available to write.
2656          */
2657         if (!sleeping)
2658         {
2659                 /* back off to last completed page boundary */
2660                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2661
2662                 /* if we have already flushed that far, we're done */
2663                 if (WriteRqstPtr <= LogwrtResult.Flush)
2664                         return;
2665         }
2666
2667         /*
2668          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2669          * to come out of low-power mode so that this async commit will reach disk
2670          * within the expected amount of time.
2671          */
2672         if (ProcGlobal->walwriterLatch)
2673                 SetLatch(ProcGlobal->walwriterLatch);
2674 }
2675
/*
 * Record the LSN up to which we can remove WAL because it's not required by
 * any replication slot.
 *
 * lsn: value to store; it unconditionally overwrites
 *		XLogCtl->replicationSlotMinLSN, with no check against the old value.
 *
 * The store is done while holding XLogCtl->info_lck.
 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2687
2688
2689 /*
2690  * Return the oldest LSN we must retain to satisfy the needs of some
2691  * replication slot.
2692  */
2693 static XLogRecPtr
2694 XLogGetReplicationSlotMinimumLSN(void)
2695 {
2696         XLogRecPtr      retval;
2697
2698         SpinLockAcquire(&XLogCtl->info_lck);
2699         retval = XLogCtl->replicationSlotMinLSN;
2700         SpinLockRelease(&XLogCtl->info_lck);
2701
2702         return retval;
2703 }
2704
2705 /*
2706  * Advance minRecoveryPoint in control file.
2707  *
2708  * If we crash during recovery, we must reach this point again before the
2709  * database is consistent.
2710  *
2711  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2712  * is only updated if it's not already greater than or equal to 'lsn'.
2713  */
2714 static void
2715 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2716 {
2717         /* Quick check using our local copy of the variable */
2718         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2719                 return;
2720
2721         /*
2722          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2723          * i.e., we're doing crash recovery.  We never modify the control file's
2724          * value in that case, so we can short-circuit future checks here too. The
2725          * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2726          * updated until crash recovery finishes.  We only do this for the startup
2727          * process as it should not update its own reference of minRecoveryPoint
2728          * until it has finished crash recovery to make sure that all WAL
2729          * available is replayed in this case.  This also saves from extra locks
2730          * taken on the control file from the startup process.
2731          */
2732         if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
2733         {
2734                 updateMinRecoveryPoint = false;
2735                 return;
2736         }
2737
2738         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2739
2740         /* update local copy */
2741         minRecoveryPoint = ControlFile->minRecoveryPoint;
2742         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2743
2744         if (XLogRecPtrIsInvalid(minRecoveryPoint))
2745                 updateMinRecoveryPoint = false;
2746         else if (force || minRecoveryPoint < lsn)
2747         {
2748                 XLogRecPtr      newMinRecoveryPoint;
2749                 TimeLineID      newMinRecoveryPointTLI;
2750
2751                 /*
2752                  * To avoid having to update the control file too often, we update it
2753                  * all the way to the last record being replayed, even though 'lsn'
2754                  * would suffice for correctness.  This also allows the 'force' case
2755                  * to not need a valid 'lsn' value.
2756                  *
2757                  * Another important reason for doing it this way is that the passed
2758                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2759                  * the caller got it from a corrupted heap page.  Accepting such a
2760                  * value as the min recovery point would prevent us from coming up at
2761                  * all.  Instead, we just log a warning and continue with recovery.
2762                  * (See also the comments about corrupt LSNs in XLogFlush.)
2763                  */
2764                 SpinLockAcquire(&XLogCtl->info_lck);
2765                 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2766                 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2767                 SpinLockRelease(&XLogCtl->info_lck);
2768
2769                 if (!force && newMinRecoveryPoint < lsn)
2770                         elog(WARNING,
2771                                  "xlog min recovery request %X/%X is past current point %X/%X",
2772                                  (uint32) (lsn >> 32), (uint32) lsn,
2773                                  (uint32) (newMinRecoveryPoint >> 32),
2774                                  (uint32) newMinRecoveryPoint);
2775
2776                 /* update control file */
2777                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2778                 {
2779                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2780                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2781                         UpdateControlFile();
2782                         minRecoveryPoint = newMinRecoveryPoint;
2783                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2784
2785                         ereport(DEBUG2,
2786                                         (errmsg("updated min recovery point to %X/%X on timeline %u",
2787                                                         (uint32) (minRecoveryPoint >> 32),
2788                                                         (uint32) minRecoveryPoint,
2789                                                         newMinRecoveryPointTLI)));
2790                 }
2791         }
2792         LWLockRelease(ControlFileLock);
2793 }
2794
2795 /*
2796  * Ensure that all XLOG data through the given position is flushed to disk.
2797  *
2798  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2799  * already held, and we try to avoid acquiring it if possible.
2800  */
2801 void
2802 XLogFlush(XLogRecPtr record)
2803 {
2804         XLogRecPtr      WriteRqstPtr;
2805         XLogwrtRqst WriteRqst;
2806
2807         /*
2808          * During REDO, we are reading not writing WAL.  Therefore, instead of
2809          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2810          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2811          * to act this way too, and because when it tries to write the
2812          * end-of-recovery checkpoint, it should indeed flush.
2813          */
2814         if (!XLogInsertAllowed())
2815         {
2816                 UpdateMinRecoveryPoint(record, false);
2817                 return;
2818         }
2819
2820         /* Quick exit if already known flushed */
2821         if (record <= LogwrtResult.Flush)
2822                 return;
2823
2824 #ifdef WAL_DEBUG
2825         if (XLOG_DEBUG)
2826                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2827                          (uint32) (record >> 32), (uint32) record,
2828                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2829                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2830 #endif
2831
2832         START_CRIT_SECTION();
2833
2834         /*
2835          * Since fsync is usually a horribly expensive operation, we try to
2836          * piggyback as much data as we can on each fsync: if we see any more data
2837          * entered into the xlog buffer, we'll write and fsync that too, so that
2838          * the final value of LogwrtResult.Flush is as large as possible. This
2839          * gives us some chance of avoiding another fsync immediately after.
2840          */
2841
2842         /* initialize to given target; may increase below */
2843         WriteRqstPtr = record;
2844
2845         /*
2846          * Now wait until we get the write lock, or someone else does the flush
2847          * for us.
2848          */
2849         for (;;)
2850         {
2851                 XLogRecPtr      insertpos;
2852
2853                 /* read LogwrtResult and update local state */
2854                 SpinLockAcquire(&XLogCtl->info_lck);
2855                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2856                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2857                 LogwrtResult = XLogCtl->LogwrtResult;
2858                 SpinLockRelease(&XLogCtl->info_lck);
2859
2860                 /* done already? */
2861                 if (record <= LogwrtResult.Flush)
2862                         break;
2863
2864                 /*
2865                  * Before actually performing the write, wait for all in-flight
2866                  * insertions to the pages we're about to write to finish.
2867                  */
2868                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2869
2870                 /*
2871                  * Try to get the write lock. If we can't get it immediately, wait
2872                  * until it's released, and recheck if we still need to do the flush
2873                  * or if the backend that held the lock did it for us already. This
2874                  * helps to maintain a good rate of group committing when the system
2875                  * is bottlenecked by the speed of fsyncing.
2876                  */
2877                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2878                 {
2879                         /*
2880                          * The lock is now free, but we didn't acquire it yet. Before we
2881                          * do, loop back to check if someone else flushed the record for
2882                          * us already.
2883                          */
2884                         continue;
2885                 }
2886
2887                 /* Got the lock; recheck whether request is satisfied */
2888                 LogwrtResult = XLogCtl->LogwrtResult;
2889                 if (record <= LogwrtResult.Flush)
2890                 {
2891                         LWLockRelease(WALWriteLock);
2892                         break;
2893                 }
2894
2895                 /*
2896                  * Sleep before flush! By adding a delay here, we may give further
2897                  * backends the opportunity to join the backlog of group commit
2898                  * followers; this can significantly improve transaction throughput,
2899                  * at the risk of increasing transaction latency.
2900                  *
2901                  * We do not sleep if enableFsync is not turned on, nor if there are
2902                  * fewer than CommitSiblings other backends with active transactions.
2903                  */
2904                 if (CommitDelay > 0 && enableFsync &&
2905                         MinimumActiveBackends(CommitSiblings))
2906                 {
2907                         pg_usleep(CommitDelay);
2908
2909                         /*
2910                          * Re-check how far we can now flush the WAL. It's generally not
2911                          * safe to call WaitXLogInsertionsToFinish while holding
2912                          * WALWriteLock, because an in-progress insertion might need to
2913                          * also grab WALWriteLock to make progress. But we know that all
2914                          * the insertions up to insertpos have already finished, because
2915                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2916                          * We're only calling it again to allow insertpos to be moved
2917                          * further forward, not to actually wait for anyone.
2918                          */
2919                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2920                 }
2921
2922                 /* try to write/flush later additions to XLOG as well */
2923                 WriteRqst.Write = insertpos;
2924                 WriteRqst.Flush = insertpos;
2925
2926                 XLogWrite(WriteRqst, false);
2927
2928                 LWLockRelease(WALWriteLock);
2929                 /* done */
2930                 break;
2931         }
2932
2933         END_CRIT_SECTION();
2934
2935         /* wake up walsenders now that we've released heavily contended locks */
2936         WalSndWakeupProcessRequests();
2937
2938         /*
2939          * If we still haven't flushed to the request point then we have a
2940          * problem; most likely, the requested flush point is past end of XLOG.
2941          * This has been seen to occur when a disk page has a corrupted LSN.
2942          *
2943          * Formerly we treated this as a PANIC condition, but that hurts the
2944          * system's robustness rather than helping it: we do not want to take down
2945          * the whole system due to corruption on one data page.  In particular, if
2946          * the bad page is encountered again during recovery then we would be
2947          * unable to restart the database at all!  (This scenario actually
2948          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2949          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2950          * the only time we can reach here during recovery is while flushing the
2951          * end-of-recovery checkpoint record, and we don't expect that to have a
2952          * bad LSN.
2953          *
2954          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2955          * since xact.c calls this routine inside a critical section.  However,
2956          * calls from bufmgr.c are not within critical sections and so we will not
2957          * force a restart for a bad LSN on a data page.
2958          */
2959         if (LogwrtResult.Flush < record)
2960                 elog(ERROR,
2961                          "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2962                          (uint32) (record >> 32), (uint32) record,
2963                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2964 }
2965
2966 /*
2967  * Write & flush xlog, but without specifying exactly where to.
2968  *
2969  * We normally write only completed blocks; but if there is nothing to do on
2970  * that basis, we check for unwritten async commits in the current incomplete
2971  * block, and write through the latest one of those.  Thus, if async commits
2972  * are not being used, we will write complete blocks only.
2973  *
2974  * If, based on the above, there's anything to write we do so immediately. But
2975  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2976  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2977  * more than wal_writer_flush_after unflushed blocks.
2978  *
2979  * We can guarantee that async commits reach disk after at most three
2980  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2981  * to write "flexibly", meaning it can stop at the end of the buffer ring;
2982  * this makes a difference only with very high load or long wal_writer_delay,
2983  * but imposes one extra cycle for the worst case for async commits.)
2984  *
2985  * This routine is invoked periodically by the background walwriter process.
2986  *
2987  * Returns true if there was any work to do, even if we skipped flushing due
2988  * to wal_writer_delay/wal_writer_flush_after.
2989  */
2990 bool
2991 XLogBackgroundFlush(void)
2992 {
2993         XLogwrtRqst WriteRqst;
2994         bool            flexible = true;
2995         static TimestampTz lastflush;
2996         TimestampTz now;
2997         int                     flushbytes;
2998
2999         /* XLOG doesn't need flushing during recovery */
3000         if (RecoveryInProgress())
3001                 return false;
3002
3003         /* read LogwrtResult and update local state */
3004         SpinLockAcquire(&XLogCtl->info_lck);
3005         LogwrtResult = XLogCtl->LogwrtResult;
3006         WriteRqst = XLogCtl->LogwrtRqst;
3007         SpinLockRelease(&XLogCtl->info_lck);
3008
3009         /* back off to last completed page boundary */
3010         WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3011
3012         /* if we have already flushed that far, consider async commit records */
3013         if (WriteRqst.Write <= LogwrtResult.Flush)
3014         {
3015                 SpinLockAcquire(&XLogCtl->info_lck);
3016                 WriteRqst.Write = XLogCtl->asyncXactLSN;
3017                 SpinLockRelease(&XLogCtl->info_lck);
3018                 flexible = false;               /* ensure it all gets written */
3019         }
3020
3021         /*
3022          * If already known flushed, we're done. Just need to check if we are
3023          * holding an open file handle to a logfile that's no longer in use,
3024          * preventing the file from being deleted.
3025          */
3026         if (WriteRqst.Write <= LogwrtResult.Flush)
3027         {
3028                 if (openLogFile >= 0)
3029                 {
3030                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3031                                                                  wal_segment_size))
3032                         {
3033                                 XLogFileClose();
3034                         }
3035                 }
3036                 return false;
3037         }
3038
3039         /*
3040          * Determine how far to flush WAL, based on the wal_writer_delay and
3041          * wal_writer_flush_after GUCs.
3042          */
3043         now = GetCurrentTimestamp();
3044         flushbytes =
3045                 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3046
3047         if (WalWriterFlushAfter == 0 || lastflush == 0)
3048         {
3049                 /* first call, or block based limits disabled */
3050                 WriteRqst.Flush = WriteRqst.Write;
3051                 lastflush = now;
3052         }
3053         else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3054         {
3055                 /*
3056                  * Flush the writes at least every WalWriteDelay ms. This is important
3057                  * to bound the amount of time it takes for an asynchronous commit to
3058                  * hit disk.
3059                  */
3060                 WriteRqst.Flush = WriteRqst.Write;
3061                 lastflush = now;
3062         }
3063         else if (flushbytes >= WalWriterFlushAfter)
3064         {
3065                 /* exceeded wal_writer_flush_after blocks, flush */
3066                 WriteRqst.Flush = WriteRqst.Write;
3067                 lastflush = now;
3068         }
3069         else
3070         {
3071                 /* no flushing, this time round */
3072                 WriteRqst.Flush = 0;
3073         }
3074
3075 #ifdef WAL_DEBUG
3076         if (XLOG_DEBUG)
3077                 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3078                          (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3079                          (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3080                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3081                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3082 #endif
3083
3084         START_CRIT_SECTION();
3085
3086         /* now wait for any in-progress insertions to finish and get write lock */
3087         WaitXLogInsertionsToFinish(WriteRqst.Write);
3088         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3089         LogwrtResult = XLogCtl->LogwrtResult;
3090         if (WriteRqst.Write > LogwrtResult.Write ||
3091                 WriteRqst.Flush > LogwrtResult.Flush)
3092         {
3093                 XLogWrite(WriteRqst, flexible);
3094         }
3095         LWLockRelease(WALWriteLock);
3096
3097         END_CRIT_SECTION();
3098
3099         /* wake up walsenders now that we've released heavily contended locks */
3100         WalSndWakeupProcessRequests();
3101
3102         /*
3103          * Great, done. To take some work off the critical path, try to initialize
3104          * as many of the no-longer-needed WAL buffers for future use as we can.
3105          */
3106         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3107
3108         /*
3109          * If we determined that we need to write data, but somebody else
3110          * wrote/flushed already, it should be considered as being active, to
3111          * avoid hibernating too early.
3112          */
3113         return true;
3114 }
3115
3116 /*
3117  * Test whether XLOG data has been flushed up to (at least) the given position.
3118  *
3119  * Returns true if a flush is still needed.  (It may be that someone else
3120  * is already in process of flushing that far, however.)
3121  */
3122 bool
3123 XLogNeedsFlush(XLogRecPtr record)
3124 {
3125         /*
3126          * During recovery, we don't flush WAL but update minRecoveryPoint
3127          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3128          * would need to be updated.
3129          */
3130         if (RecoveryInProgress())
3131         {
3132                 /*
3133                  * An invalid minRecoveryPoint means that we need to recover all the
3134                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3135                  * file's value in that case, so we can short-circuit future checks
3136                  * here too.  This triggers a quick exit path for the startup process,
3137                  * which cannot update its local copy of minRecoveryPoint as long as
3138                  * it has not replayed all WAL available when doing crash recovery.
3139                  */
3140                 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
3141                         updateMinRecoveryPoint = false;
3142
3143                 /* Quick exit if already known to be updated or cannot be updated */
3144                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3145                         return false;
3146
3147                 /*
3148                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3149                  * just return a conservative guess.
3150                  */
3151                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3152                         return true;
3153                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3154                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3155                 LWLockRelease(ControlFileLock);
3156
3157                 /*
3158                  * Check minRecoveryPoint for any other process than the startup
3159                  * process doing crash recovery, which should not update the control
3160                  * file value if crash recovery is still running.
3161                  */
3162                 if (XLogRecPtrIsInvalid(minRecoveryPoint))
3163                         updateMinRecoveryPoint = false;
3164
3165                 /* check again */
3166                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3167                         return false;
3168                 else
3169                         return true;
3170         }
3171
3172         /* Quick exit if already known flushed */
3173         if (record <= LogwrtResult.Flush)
3174                 return false;
3175
3176         /* read LogwrtResult and update local state */
3177         SpinLockAcquire(&XLogCtl->info_lck);
3178         LogwrtResult = XLogCtl->LogwrtResult;
3179         SpinLockRelease(&XLogCtl->info_lck);
3180
3181         /* check again */
3182         if (record <= LogwrtResult.Flush)
3183                 return false;
3184
3185         return true;
3186 }
3187
3188 /*
3189  * Create a new XLOG file segment, or open a pre-existing one.
3190  *
3191  * log, seg: identify segment to be created/opened.
3192  *
3193  * *use_existent: if true, OK to use a pre-existing file (else, any
3194  * pre-existing file will be deleted).  On return, true if a pre-existing
3195  * file was used.
3196  *
3197  * use_lock: if true, acquire ControlFileLock while moving file into
3198  * place.  This should be true except during bootstrap log creation.  The
3199  * caller must *not* hold the lock at call.
3200  *
3201  * Returns FD of opened file.
3202  *
3203  * Note: errors here are ERROR not PANIC because we might or might not be
3204  * inside a critical section (eg, during checkpoint there is no reason to
3205  * take down the system on failure).  They will promote to PANIC if we are
3206  * in a critical section.
3207  */
3208 int
3209 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3210 {
3211         char            path[MAXPGPATH];
3212         char            tmppath[MAXPGPATH];
3213         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
3214         char       *zbuffer;
3215         XLogSegNo       installed_segno;
3216         XLogSegNo       max_segno;
3217         int                     fd;
3218         int                     nbytes;
3219
3220         XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
3221
3222         /*
3223          * Try to use existent file (checkpoint maker may have created it already)
3224          */
3225         if (*use_existent)
3226         {
3227                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3228                 if (fd < 0)
3229                 {
3230                         if (errno != ENOENT)
3231                                 ereport(ERROR,
3232                                                 (errcode_for_file_access(),
3233                                                  errmsg("could not open file \"%s\": %m", path)));
3234                 }
3235                 else
3236                         return fd;
3237         }
3238
3239         /*
3240          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3241          * another process is doing the same thing.  If so, we will end up
3242          * pre-creating an extra log segment.  That seems OK, and better than
3243          * holding the lock throughout this lengthy process.
3244          */
3245         elog(DEBUG2, "creating and filling new WAL file");
3246
3247         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3248
3249         unlink(tmppath);
3250
3251         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3252         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3253         if (fd < 0)
3254                 ereport(ERROR,
3255                                 (errcode_for_file_access(),
3256                                  errmsg("could not create file \"%s\": %m", tmppath)));
3257
3258         /*
3259          * Zero-fill the file.  We have to do this the hard way to ensure that all
3260          * the file space has really been allocated --- on platforms that allow
3261          * "holes" in files, just seeking to the end doesn't allocate intermediate
3262          * space.  This way, we know that we have all the space and (after the
3263          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3264          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3265          * log file.
3266          *
3267          * Note: ensure the buffer is reasonably well-aligned; this may save a few
3268          * cycles transferring data to the kernel.
3269          */
3270         zbuffer = (char *) MAXALIGN(zbuffer_raw);
3271         memset(zbuffer, 0, XLOG_BLCKSZ);
3272         for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
3273         {
3274                 errno = 0;
3275                 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3276                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3277                 {
3278                         int                     save_errno = errno;
3279
3280                         /*
3281                          * If we fail to make the file, delete it to release disk space
3282                          */
3283                         unlink(tmppath);
3284
3285                         close(fd);
3286
3287                         /* if write didn't set errno, assume problem is no disk space */
3288                         errno = save_errno ? save_errno : ENOSPC;
3289
3290                         ereport(ERROR,
3291                                         (errcode_for_file_access(),
3292                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3293                 }
3294                 pgstat_report_wait_end();
3295         }
3296
3297         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3298         if (pg_fsync(fd) != 0)
3299         {
3300                 int                     save_errno = errno;
3301
3302                 close(fd);
3303                 errno = save_errno;
3304                 ereport(ERROR,
3305                                 (errcode_for_file_access(),
3306                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3307         }
3308         pgstat_report_wait_end();
3309
3310         if (close(fd))
3311                 ereport(ERROR,
3312                                 (errcode_for_file_access(),
3313                                  errmsg("could not close file \"%s\": %m", tmppath)));
3314
3315         /*
3316          * Now move the segment into place with its final name.
3317          *
3318          * If caller didn't want to use a pre-existing file, get rid of any
3319          * pre-existing file.  Otherwise, cope with possibility that someone else
3320          * has created the file while we were filling ours: if so, use ours to
3321          * pre-create a future log segment.
3322          */
3323         installed_segno = logsegno;
3324
3325         /*
3326          * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3327          * that was a constant, but that was always a bit dubious: normally, at a
3328          * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3329          * here, it was the offset from the insert location. We can't do the
3330          * normal XLOGfileslop calculation here because we don't have access to
3331          * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3332          * CheckPointSegments.
3333          */
3334         max_segno = logsegno + CheckPointSegments;
3335         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3336                                                                 *use_existent, max_segno,
3337                                                                 use_lock))
3338         {
3339                 /*
3340                  * No need for any more future segments, or InstallXLogFileSegment()
3341                  * failed to rename the file into place. If the rename failed, opening
3342                  * the file below will fail.
3343                  */
3344                 unlink(tmppath);
3345         }
3346
3347         /* Set flag to tell caller there was no existent file */
3348         *use_existent = false;
3349
3350         /* Now open original target segment (might not be file I just made) */
3351         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3352         if (fd < 0)
3353                 ereport(ERROR,
3354                                 (errcode_for_file_access(),
3355                                  errmsg("could not open file \"%s\": %m", path)));
3356
3357         elog(DEBUG2, "done creating and filling new WAL file");
3358
3359         return fd;
3360 }
3361
3362 /*
3363  * Create a new XLOG file segment by copying a pre-existing one.
3364  *
3365  * destsegno: identify segment to be created.
3366  *
3367  * srcTLI, srcsegno: identify segment to be copied (could be from
3368  *              a different timeline)
3369  *
3370  * upto: how much of the source file to copy (the rest is filled with
3371  *              zeros)
3372  *
3373  * Currently this is only used during recovery, and so there are no locking
3374  * considerations.  But we should be just as tense as XLogFileInit to avoid
3375  * emplacing a bogus file.
3376  */
3377 static void
3378 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3379                          int upto)
3380 {
3381         char            path[MAXPGPATH];
3382         char            tmppath[MAXPGPATH];
3383         char            buffer[XLOG_BLCKSZ];
3384         int                     srcfd;
3385         int                     fd;
3386         int                     nbytes;
3387
3388         /*
3389          * Open the source file
3390          */
3391         XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3392         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3393         if (srcfd < 0)
3394                 ereport(ERROR,
3395                                 (errcode_for_file_access(),
3396                                  errmsg("could not open file \"%s\": %m", path)));
3397
3398         /*
3399          * Copy into a temp file name.
3400          */
3401         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3402
3403         unlink(tmppath);
3404
3405         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3406         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3407         if (fd < 0)
3408                 ereport(ERROR,
3409                                 (errcode_for_file_access(),
3410                                  errmsg("could not create file \"%s\": %m", tmppath)));
3411
3412         /*
3413          * Do the data copying.
3414          */
3415         for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3416         {
3417                 int                     nread;
3418
3419                 nread = upto - nbytes;
3420
3421                 /*
3422                  * The part that is not read from the source file is filled with
3423                  * zeros.
3424                  */
3425                 if (nread < sizeof(buffer))
3426                         memset(buffer, 0, sizeof(buffer));
3427
3428                 if (nread > 0)
3429                 {
3430                         int                     r;
3431
3432                         if (nread > sizeof(buffer))
3433                                 nread = sizeof(buffer);
3434                         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3435                         r = read(srcfd, buffer, nread);
3436                         if (r != nread)
3437                         {
3438                                 if (r < 0)
3439                                         ereport(ERROR,
3440                                                         (errcode_for_file_access(),
3441                                                          errmsg("could not read file \"%s\": %m",
3442                                                                         path)));
3443                                 else
3444                                         ereport(ERROR,
3445                                                         (errcode(ERRCODE_DATA_CORRUPTED),
3446                                                          errmsg("could not read file \"%s\": read %d of %zu",
3447                                                                         path, r, (Size) nread)));
3448                         }
3449                         pgstat_report_wait_end();
3450                 }
3451                 errno = 0;
3452                 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3453                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3454                 {
3455                         int                     save_errno = errno;
3456
3457                         /*
3458                          * If we fail to make the file, delete it to release disk space
3459                          */
3460                         unlink(tmppath);
3461                         /* if write didn't set errno, assume problem is no disk space */
3462                         errno = save_errno ? save_errno : ENOSPC;
3463
3464                         ereport(ERROR,
3465                                         (errcode_for_file_access(),
3466                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3467                 }
3468                 pgstat_report_wait_end();
3469         }
3470
3471         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3472         if (pg_fsync(fd) != 0)
3473                 ereport(ERROR,
3474                                 (errcode_for_file_access(),
3475                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3476         pgstat_report_wait_end();
3477
3478         if (CloseTransientFile(fd))
3479                 ereport(ERROR,
3480                                 (errcode_for_file_access(),
3481                                  errmsg("could not close file \"%s\": %m", tmppath)));
3482
3483         CloseTransientFile(srcfd);
3484
3485         /*
3486          * Now move the segment into place with its final name.
3487          */
3488         if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3489                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3490 }
3491
3492 /*
3493  * Install a new XLOG segment file as a current or future log segment.
3494  *
3495  * This is used both to install a newly-created segment (which has a temp
3496  * filename while it's being created) and to recycle an old segment.
3497  *
3498  * *segno: identify segment to install as (or first possible target).
3499  * When find_free is true, this is modified on return to indicate the
3500  * actual installation location or last segment searched.
3501  *
3502  * tmppath: initial name of file to install.  It will be renamed into place.
3503  *
3504  * find_free: if true, install the new segment at the first empty segno
3505  * number at or after the passed numbers.  If false, install the new segment
3506  * exactly where specified, deleting any existing segment file there.
3507  *
3508  * max_segno: maximum segment number to install the new file as.  Fail if no
3509  * free slot is found between *segno and max_segno. (Ignored when find_free
3510  * is false.)
3511  *
3512  * use_lock: if true, acquire ControlFileLock while moving file into
3513  * place.  This should be true except during bootstrap log creation.  The
3514  * caller must *not* hold the lock at call.
3515  *
3516  * Returns true if the file was installed successfully.  false indicates that
3517  * max_segno limit was exceeded, or an error occurred while renaming the
3518  * file into place.
3519  */
3520 static bool
3521 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3522                                            bool find_free, XLogSegNo max_segno,
3523                                            bool use_lock)
3524 {
3525         char            path[MAXPGPATH];
3526         struct stat stat_buf;
3527
3528         XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3529
3530         /*
3531          * We want to be sure that only one process does this at a time.
3532          */
3533         if (use_lock)
3534                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3535
3536         if (!find_free)
3537         {
3538                 /* Force installation: get rid of any pre-existing segment file */
3539                 durable_unlink(path, DEBUG1);
3540         }
3541         else
3542         {
3543                 /* Find a free slot to put it in */
3544                 while (stat(path, &stat_buf) == 0)
3545                 {
3546                         if ((*segno) >= max_segno)
3547                         {
3548                                 /* Failed to find a free slot within specified range */
3549                                 if (use_lock)
3550                                         LWLockRelease(ControlFileLock);
3551                                 return false;
3552                         }
3553                         (*segno)++;
3554                         XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3555                 }
3556         }
3557
3558         /*
3559          * Perform the rename using link if available, paranoidly trying to avoid
3560          * overwriting an existing file (there shouldn't be one).
3561          */
3562         if (durable_link_or_rename(tmppath, path, LOG) != 0)
3563         {
3564                 if (use_lock)
3565                         LWLockRelease(ControlFileLock);
3566                 /* durable_link_or_rename already emitted log message */
3567                 return false;
3568         }
3569
3570         if (use_lock)
3571                 LWLockRelease(ControlFileLock);
3572
3573         return true;
3574 }
3575
3576 /*
3577  * Open a pre-existing logfile segment for writing.
3578  */
3579 int
3580 XLogFileOpen(XLogSegNo segno)
3581 {
3582         char            path[MAXPGPATH];
3583         int                     fd;
3584
3585         XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3586
3587         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3588         if (fd < 0)
3589                 ereport(PANIC,
3590                                 (errcode_for_file_access(),
3591                                  errmsg("could not open file \"%s\": %m", path)));
3592
3593         return fd;
3594 }
3595
3596 /*
3597  * Open a logfile segment for reading (during recovery).
3598  *
3599  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3600  * Otherwise, it's assumed to be already available in pg_wal.
3601  */
3602 static int
3603 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3604                          int source, bool notfoundOk)
3605 {
3606         char            xlogfname[MAXFNAMELEN];
3607         char            activitymsg[MAXFNAMELEN + 16];
3608         char            path[MAXPGPATH];
3609         int                     fd;
3610
3611         XLogFileName(xlogfname, tli, segno, wal_segment_size);
3612
3613         switch (source)
3614         {
3615                 case XLOG_FROM_ARCHIVE:
3616                         /* Report recovery progress in PS display */
3617                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3618                                          xlogfname);
3619                         set_ps_display(activitymsg, false);
3620
3621                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3622                                                                                                           "RECOVERYXLOG",
3623                                                                                                           wal_segment_size,
3624                                                                                                           InRedo);
3625                         if (!restoredFromArchive)
3626                                 return -1;
3627                         break;
3628
3629                 case XLOG_FROM_PG_WAL:
3630                 case XLOG_FROM_STREAM:
3631                         XLogFilePath(path, tli, segno, wal_segment_size);
3632                         restoredFromArchive = false;
3633                         break;
3634
3635                 default:
3636                         elog(ERROR, "invalid XLogFileRead source %d", source);
3637         }
3638
3639         /*
3640          * If the segment was fetched from archival storage, replace the existing
3641          * xlog segment (if any) with the archival version.
3642          */
3643         if (source == XLOG_FROM_ARCHIVE)
3644         {
3645                 KeepFileRestoredFromArchive(path, xlogfname);
3646
3647                 /*
3648                  * Set path to point at the new file in pg_wal.
3649                  */
3650                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3651         }
3652
3653         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
3654         if (fd >= 0)
3655         {
3656                 /* Success! */
3657                 curFileTLI = tli;
3658
3659                 /* Report recovery progress in PS display */
3660                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3661                                  xlogfname);
3662                 set_ps_display(activitymsg, false);
3663
3664                 /* Track source of data in assorted state variables */
3665                 readSource = source;
3666                 XLogReceiptSource = source;
3667                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3668                 if (source != XLOG_FROM_STREAM)
3669                         XLogReceiptTime = GetCurrentTimestamp();
3670
3671                 return fd;
3672         }
3673         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3674                 ereport(PANIC,
3675                                 (errcode_for_file_access(),
3676                                  errmsg("could not open file \"%s\": %m", path)));
3677         return -1;
3678 }
3679
3680 /*
3681  * Open a logfile segment for reading (during recovery).
3682  *
3683  * This version searches for the segment with any TLI listed in expectedTLEs.
3684  */
3685 static int
3686 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3687 {
3688         char            path[MAXPGPATH];
3689         ListCell   *cell;
3690         int                     fd;
3691         List       *tles;
3692
3693         /*
3694          * Loop looking for a suitable timeline ID: we might need to read any of
3695          * the timelines listed in expectedTLEs.
3696          *
3697          * We expect curFileTLI on entry to be the TLI of the preceding file in
3698          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3699          * to go backwards; this prevents us from picking up the wrong file when a
3700          * parent timeline extends to higher segment numbers than the child we
3701          * want to read.
3702          *
3703          * If we haven't read the timeline history file yet, read it now, so that
3704          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3705          * however, unless we actually find a valid segment.  That way if there is
3706          * neither a timeline history file nor a WAL segment in the archive, and
3707          * streaming replication is set up, we'll read the timeline history file
3708          * streamed from the master when we start streaming, instead of recovering
3709          * with a dummy history generated here.
3710          */
3711         if (expectedTLEs)
3712                 tles = expectedTLEs;
3713         else
3714                 tles = readTimeLineHistory(recoveryTargetTLI);
3715
3716         foreach(cell, tles)
3717         {
3718                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3719
3720                 if (tli < curFileTLI)
3721                         break;                          /* don't bother looking at too-old TLIs */
3722
3723                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3724                 {
3725                         fd = XLogFileRead(segno, emode, tli,
3726                                                           XLOG_FROM_ARCHIVE, true);
3727                         if (fd != -1)
3728                         {
3729                                 elog(DEBUG1, "got WAL segment from archive");
3730                                 if (!expectedTLEs)
3731                                         expectedTLEs = tles;
3732                                 return fd;
3733                         }
3734                 }
3735
3736                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3737                 {
3738                         fd = XLogFileRead(segno, emode, tli,
3739                                                           XLOG_FROM_PG_WAL, true);
3740                         if (fd != -1)
3741                         {
3742                                 if (!expectedTLEs)
3743                                         expectedTLEs = tles;
3744                                 return fd;
3745                         }
3746                 }
3747         }
3748
3749         /* Couldn't find it.  For simplicity, complain about front timeline */
3750         XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
3751         errno = ENOENT;
3752         ereport(emode,
3753                         (errcode_for_file_access(),
3754                          errmsg("could not open file \"%s\": %m", path)));
3755         return -1;
3756 }
3757
3758 /*
3759  * Close the current logfile segment for writing.
3760  */
3761 static void
3762 XLogFileClose(void)
3763 {
3764         Assert(openLogFile >= 0);
3765
3766         /*
3767          * WAL segment files will not be re-read in normal operation, so we advise
3768          * the OS to release any cached pages.  But do not do so if WAL archiving
3769          * or streaming is active, because archiver and walsender process could
3770          * use the cache to read the WAL segment.
3771          */
3772 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3773         if (!XLogIsNeeded())
3774                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3775 #endif
3776
3777         if (close(openLogFile))
3778                 ereport(PANIC,
3779                                 (errcode_for_file_access(),
3780                                  errmsg("could not close file \"%s\": %m",
3781                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3782         openLogFile = -1;
3783 }
3784
3785 /*
3786  * Preallocate log files beyond the specified log endpoint.
3787  *
3788  * XXX this is currently extremely conservative, since it forces only one
3789  * future log segment to exist, and even that only if we are 75% done with
3790  * the current one.  This is only appropriate for very low-WAL-volume systems.
3791  * High-volume systems will be OK once they've built up a sufficient set of
3792  * recycled log segments, but the startup transient is likely to include
3793  * a lot of segment creations by foreground processes, which is not so good.
3794  */
3795 static void
3796 PreallocXlogFiles(XLogRecPtr endptr)
3797 {
3798         XLogSegNo       _logSegNo;
3799         int                     lf;
3800         bool            use_existent;
3801         uint64          offset;
3802
3803         XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3804         offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3805         if (offset >= (uint32) (0.75 * wal_segment_size))
3806         {
3807                 _logSegNo++;
3808                 use_existent = true;
3809                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3810                 close(lf);
3811                 if (!use_existent)
3812                         CheckpointStats.ckpt_segs_added++;
3813         }
3814 }
3815
3816 /*
3817  * Throws an error if the given log segment has already been removed or
3818  * recycled. The caller should only pass a segment that it knows to have
3819  * existed while the server has been running, as this function always
3820  * succeeds if no WAL segments have been removed since startup.
3821  * 'tli' is only used in the error message.
3822  *
3823  * Note: this function guarantees to keep errno unchanged on return.
3824  * This supports callers that use this to possibly deliver a better
3825  * error message about a missing file, while still being able to throw
3826  * a normal file-access error afterwards, if this does return.
3827  */
3828 void
3829 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3830 {
3831         int                     save_errno = errno;
3832         XLogSegNo       lastRemovedSegNo;
3833
3834         SpinLockAcquire(&XLogCtl->info_lck);
3835         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3836         SpinLockRelease(&XLogCtl->info_lck);
3837
3838         if (segno <= lastRemovedSegNo)
3839         {
3840                 char            filename[MAXFNAMELEN];
3841
3842                 XLogFileName(filename, tli, segno, wal_segment_size);
3843                 errno = save_errno;
3844                 ereport(ERROR,
3845                                 (errcode_for_file_access(),
3846                                  errmsg("requested WAL segment %s has already been removed",
3847                                                 filename)));
3848         }
3849         errno = save_errno;
3850 }
3851
3852 /*
3853  * Return the last WAL segment removed, or 0 if no segment has been removed
3854  * since startup.
3855  *
3856  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3857  * with that.
3858  */
3859 XLogSegNo
3860 XLogGetLastRemovedSegno(void)
3861 {
3862         XLogSegNo       lastRemovedSegNo;
3863
3864         SpinLockAcquire(&XLogCtl->info_lck);
3865         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3866         SpinLockRelease(&XLogCtl->info_lck);
3867
3868         return lastRemovedSegNo;
3869 }
3870
3871 /*
3872  * Update the last removed segno pointer in shared memory, to reflect
3873  * that the given XLOG file has been removed.
3874  */
3875 static void
3876 UpdateLastRemovedPtr(char *filename)
3877 {
3878         uint32          tli;
3879         XLogSegNo       segno;
3880
3881         XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3882
3883         SpinLockAcquire(&XLogCtl->info_lck);
3884         if (segno > XLogCtl->lastRemovedSegNo)
3885                 XLogCtl->lastRemovedSegNo = segno;
3886         SpinLockRelease(&XLogCtl->info_lck);
3887 }
3888
3889 /*
3890  * Remove all temporary log files in pg_wal
3891  *
3892  * This is called at the beginning of recovery after a previous crash,
3893  * at a point where no other processes write fresh WAL data.
3894  */
3895 static void
3896 RemoveTempXlogFiles(void)
3897 {
3898         DIR                *xldir;
3899         struct dirent *xlde;
3900
3901         elog(DEBUG2, "removing all temporary WAL segments");
3902
3903         xldir = AllocateDir(XLOGDIR);
3904         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3905         {
3906                 char            path[MAXPGPATH];
3907
3908                 if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3909                         continue;
3910
3911                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3912                 unlink(path);
3913                 elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3914         }
3915         FreeDir(xldir);
3916 }
3917
3918 /*
3919  * Recycle or remove all log files older or equal to passed segno.
3920  *
3921  * endptr is current (or recent) end of xlog, and RedoRecPtr is the
3922  * redo pointer of the last checkpoint. These are used to determine
3923  * whether we want to recycle rather than delete no-longer-wanted log files.
3924  */
3925 static void
3926 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
3927 {
3928         DIR                *xldir;
3929         struct dirent *xlde;
3930         char            lastoff[MAXFNAMELEN];
3931
3932         /*
3933          * Construct a filename of the last segment to be kept. The timeline ID
3934          * doesn't matter, we ignore that in the comparison. (During recovery,
3935          * ThisTimeLineID isn't set, so we can't use that.)
3936          */
3937         XLogFileName(lastoff, 0, segno, wal_segment_size);
3938
3939         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3940                  lastoff);
3941
3942         xldir = AllocateDir(XLOGDIR);
3943
3944         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3945         {
3946                 /* Ignore files that are not XLOG segments */
3947                 if (!IsXLogFileName(xlde->d_name) &&
3948                         !IsPartialXLogFileName(xlde->d_name))
3949                         continue;
3950
3951                 /*
3952                  * We ignore the timeline part of the XLOG segment identifiers in
3953                  * deciding whether a segment is still needed.  This ensures that we
3954                  * won't prematurely remove a segment from a parent timeline. We could
3955                  * probably be a little more proactive about removing segments of
3956                  * non-parent timelines, but that would be a whole lot more
3957                  * complicated.
3958                  *
3959                  * We use the alphanumeric sorting property of the filenames to decide
3960                  * which ones are earlier than the lastoff segment.
3961                  */
3962                 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3963                 {
3964                         if (XLogArchiveCheckDone(xlde->d_name))
3965                         {
3966                                 /* Update the last removed location in shared memory first */
3967                                 UpdateLastRemovedPtr(xlde->d_name);
3968
3969                                 RemoveXlogFile(xlde->d_name, RedoRecPtr, endptr);
3970                         }
3971                 }
3972         }
3973
3974         FreeDir(xldir);
3975 }
3976
3977 /*
3978  * Remove WAL files that are not part of the given timeline's history.
3979  *
3980  * This is called during recovery, whenever we switch to follow a new
3981  * timeline, and at the end of recovery when we create a new timeline. We
3982  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3983  * might be leftover pre-allocated or recycled WAL segments on the old timeline
3984  * that we haven't used yet, and contain garbage. If we just leave them in
3985  * pg_wal, they will eventually be archived, and we can't let that happen.
3986  * Files that belong to our timeline history are valid, because we have
3987  * successfully replayed them, but from others we can't be sure.
3988  *
3989  * 'switchpoint' is the current point in WAL where we switch to new timeline,
3990  * and 'newTLI' is the new timeline we switch to.
3991  */
3992 static void
3993 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3994 {
3995         DIR                *xldir;
3996         struct dirent *xlde;
3997         char            switchseg[MAXFNAMELEN];
3998         XLogSegNo       endLogSegNo;
3999
4000         XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
4001
4002         /*
4003          * Construct a filename of the last segment to be kept.
4004          */
4005         XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
4006
4007         elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4008                  switchseg);
4009
4010         xldir = AllocateDir(XLOGDIR);
4011
4012         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4013         {
4014                 /* Ignore files that are not XLOG segments */
4015                 if (!IsXLogFileName(xlde->d_name))
4016                         continue;
4017
4018                 /*
4019                  * Remove files that are on a timeline older than the new one we're
4020                  * switching to, but with a segment number >= the first segment on the
4021                  * new timeline.
4022                  */
4023                 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4024                         strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4025                 {
4026                         /*
4027                          * If the file has already been marked as .ready, however, don't
4028                          * remove it yet. It should be OK to remove it - files that are
4029                          * not part of our timeline history are not required for recovery
4030                          * - but seems safer to let them be archived and removed later.
4031                          */
4032                         if (!XLogArchiveIsReady(xlde->d_name))
4033                                 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
4034                 }
4035         }
4036
4037         FreeDir(xldir);
4038 }
4039
4040 /*
4041  * Recycle or remove a log file that's no longer needed.
4042  *
4043  * endptr is current (or recent) end of xlog, and RedoRecPtr is the
4044  * redo pointer of the last checkpoint. These are used to determine
4045  * whether we want to recycle rather than delete no-longer-wanted log files.
4046  * If RedoRecPtr is not known, pass invalid, and the function will recycle,
4047  * somewhat arbitrarily, 10 future segments.
4048  */
4049 static void
4050 RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
4051 {
4052         char            path[MAXPGPATH];
4053 #ifdef WIN32
4054         char            newpath[MAXPGPATH];
4055 #endif
4056         struct stat statbuf;
4057         XLogSegNo       endlogSegNo;
4058         XLogSegNo       recycleSegNo;
4059
4060         /*
4061          * Initialize info about where to try to recycle to.
4062          */
4063         XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4064         if (RedoRecPtr == InvalidXLogRecPtr)
4065                 recycleSegNo = endlogSegNo + 10;
4066         else
4067                 recycleSegNo = XLOGfileslop(RedoRecPtr);
4068
4069         snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4070
4071         /*
4072          * Before deleting the file, see if it can be recycled as a future log
4073          * segment. Only recycle normal files, pg_standby for example can create
4074          * symbolic links pointing to a separate archive directory.
4075          */
4076         if (endlogSegNo <= recycleSegNo &&
4077                 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4078                 InstallXLogFileSegment(&endlogSegNo, path,
4079                                                            true, recycleSegNo, true))
4080         {
4081                 ereport(DEBUG2,
4082                                 (errmsg("recycled write-ahead log file \"%s\"",
4083                                                 segname)));
4084                 CheckpointStats.ckpt_segs_recycled++;
4085                 /* Needn't recheck that slot on future iterations */
4086                 endlogSegNo++;
4087         }
4088         else
4089         {
4090                 /* No need for any more future segments... */
4091                 int                     rc;
4092
4093                 ereport(DEBUG2,
4094                                 (errmsg("removing write-ahead log file \"%s\"",
4095                                                 segname)));
4096
4097 #ifdef WIN32
4098
4099                 /*
4100                  * On Windows, if another process (e.g another backend) holds the file
4101                  * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4102                  * will still show up in directory listing until the last handle is
4103                  * closed. To avoid confusing the lingering deleted file for a live
4104                  * WAL file that needs to be archived, rename it before deleting it.
4105                  *
4106                  * If another process holds the file open without FILE_SHARE_DELETE
4107                  * flag, rename will fail. We'll try again at the next checkpoint.
4108                  */
4109                 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4110                 if (rename(path, newpath) != 0)
4111                 {
4112                         ereport(LOG,
4113                                         (errcode_for_file_access(),
4114                                          errmsg("could not rename file \"%s\": %m",
4115                                                         path)));
4116                         return;
4117                 }
4118                 rc = durable_unlink(newpath, LOG);
4119 #else
4120                 rc = durable_unlink(path, LOG);
4121 #endif
4122                 if (rc != 0)
4123                 {
4124                         /* Message already logged by durable_unlink() */
4125                         return;
4126                 }
4127                 CheckpointStats.ckpt_segs_removed++;
4128         }
4129
4130         XLogArchiveCleanup(segname);
4131 }
4132
4133 /*
4134  * Verify whether pg_wal and pg_wal/archive_status exist.
4135  * If the latter does not exist, recreate it.
4136  *
4137  * It is not the goal of this function to verify the contents of these
4138  * directories, but to help in cases where someone has performed a cluster
4139  * copy for PITR purposes but omitted pg_wal from the copy.
4140  *
4141  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4142  * policy decision was made not to.  It is fairly common for pg_wal to be
4143  * a symlink, and if that was the DBA's intent then automatically making a
4144  * plain directory would result in degraded performance with no notice.
4145  */
4146 static void
4147 ValidateXLOGDirectoryStructure(void)
4148 {
4149         char            path[MAXPGPATH];
4150         struct stat stat_buf;
4151
4152         /* Check for pg_wal; if it doesn't exist, error out */
4153         if (stat(XLOGDIR, &stat_buf) != 0 ||
4154                 !S_ISDIR(stat_buf.st_mode))
4155                 ereport(FATAL,
4156                                 (errmsg("required WAL directory \"%s\" does not exist",
4157                                                 XLOGDIR)));
4158
4159         /* Check for archive_status */
4160         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4161         if (stat(path, &stat_buf) == 0)
4162         {
4163                 /* Check for weird cases where it exists but isn't a directory */
4164                 if (!S_ISDIR(stat_buf.st_mode))
4165                         ereport(FATAL,
4166                                         (errmsg("required WAL directory \"%s\" does not exist",
4167                                                         path)));
4168         }
4169         else
4170         {
4171                 ereport(LOG,
4172                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4173                 if (MakePGDirectory(path) < 0)
4174                         ereport(FATAL,
4175                                         (errmsg("could not create missing directory \"%s\": %m",
4176                                                         path)));
4177         }
4178 }
4179
4180 /*
4181  * Remove previous backup history files.  This also retries creation of
4182  * .ready files for any backup history files for which XLogArchiveNotify
4183  * failed earlier.
4184  */
4185 static void
4186 CleanupBackupHistory(void)
4187 {
4188         DIR                *xldir;
4189         struct dirent *xlde;
4190         char            path[MAXPGPATH + sizeof(XLOGDIR)];
4191
4192         xldir = AllocateDir(XLOGDIR);
4193
4194         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4195         {
4196                 if (IsBackupHistoryFileName(xlde->d_name))
4197                 {
4198                         if (XLogArchiveCheckDone(xlde->d_name))
4199                         {
4200                                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4201                                          xlde->d_name);
4202                                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4203                                 unlink(path);
4204                                 XLogArchiveCleanup(xlde->d_name);
4205                         }
4206                 }
4207         }
4208
4209         FreeDir(xldir);
4210 }
4211
4212 /*
4213  * Attempt to read an XLOG record.
4214  *
4215  * If RecPtr is valid, try to read a record at that position.  Otherwise
4216  * try to read a record just after the last one previously read.
4217  *
4218  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4219  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4220  * record is available.
4221  *
4222  * The record is copied into readRecordBuf, so that on successful return,
4223  * the returned record pointer always points there.
4224  */
4225 static XLogRecord *
4226 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4227                    bool fetching_ckpt)
4228 {
4229         XLogRecord *record;
4230         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4231
4232         /* Pass through parameters to XLogPageRead */
4233         private->fetching_ckpt = fetching_ckpt;
4234         private->emode = emode;
4235         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4236
4237         /* This is the first attempt to read this page. */
4238         lastSourceFailed = false;
4239
4240         for (;;)
4241         {
4242                 char       *errormsg;
4243
4244                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4245                 ReadRecPtr = xlogreader->ReadRecPtr;
4246                 EndRecPtr = xlogreader->EndRecPtr;
4247                 if (record == NULL)
4248                 {
4249                         if (readFile >= 0)
4250                         {
4251                                 close(readFile);
4252                                 readFile = -1;
4253                         }
4254
4255                         /*
4256                          * We only end up here without a message when XLogPageRead()
4257                          * failed - in that case we already logged something. In
4258                          * StandbyMode that only happens if we have been triggered, so we
4259                          * shouldn't loop anymore in that case.
4260                          */
4261                         if (errormsg)
4262                                 ereport(emode_for_corrupt_record(emode,
4263                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4264                                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4265                 }
4266
4267                 /*
4268                  * Check page TLI is one of the expected values.
4269                  */
4270                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4271                 {
4272                         char            fname[MAXFNAMELEN];
4273                         XLogSegNo       segno;
4274                         int32           offset;
4275
4276                         XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
4277                         offset = XLogSegmentOffset(xlogreader->latestPagePtr,
4278                                                                            wal_segment_size);
4279                         XLogFileName(fname, xlogreader->readPageTLI, segno,
4280                                                  wal_segment_size);
4281                         ereport(emode_for_corrupt_record(emode,
4282                                                                                          RecPtr ? RecPtr : EndRecPtr),
4283                                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4284                                                         xlogreader->latestPageTLI,
4285                                                         fname,
4286                                                         offset)));
4287                         record = NULL;
4288                 }
4289
4290                 if (record)
4291                 {
4292                         /* Great, got a record */
4293                         return record;
4294                 }
4295                 else
4296                 {
4297                         /* No valid record available from this source */
4298                         lastSourceFailed = true;
4299
4300                         /*
4301                          * If archive recovery was requested, but we were still doing
4302                          * crash recovery, switch to archive recovery and retry using the
4303                          * offline archive. We have now replayed all the valid WAL in
4304                          * pg_wal, so we are presumably now consistent.
4305                          *
4306                          * We require that there's at least some valid WAL present in
4307                          * pg_wal, however (!fetching_ckpt).  We could recover using the
4308                          * WAL from the archive, even if pg_wal is completely empty, but
4309                          * we'd have no idea how far we'd have to replay to reach
4310                          * consistency.  So err on the safe side and give up.
4311                          */
4312                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4313                                 !fetching_ckpt)
4314                         {
4315                                 ereport(DEBUG1,
4316                                                 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4317                                 InArchiveRecovery = true;
4318                                 if (StandbyModeRequested)
4319                                         StandbyMode = true;
4320
4321                                 /* initialize minRecoveryPoint to this record */
4322                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4323                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4324                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4325                                 {
4326                                         ControlFile->minRecoveryPoint = EndRecPtr;
4327                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4328                                 }
4329                                 /* update local copy */
4330                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4331                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4332
4333                                 /*
4334                                  * The startup process can update its local copy of
4335                                  * minRecoveryPoint from this point.
4336                                  */
4337                                 updateMinRecoveryPoint = true;
4338
4339                                 UpdateControlFile();
4340                                 LWLockRelease(ControlFileLock);
4341
4342                                 CheckRecoveryConsistency();
4343
4344                                 /*
4345                                  * Before we retry, reset lastSourceFailed and currentSource
4346                                  * so that we will check the archive next.
4347                                  */
4348                                 lastSourceFailed = false;
4349                                 currentSource = 0;
4350
4351                                 continue;
4352                         }
4353
4354                         /* In standby mode, loop back to retry. Otherwise, give up. */
4355                         if (StandbyMode && !CheckForStandbyTrigger())
4356                                 continue;
4357                         else
4358                                 return NULL;
4359                 }
4360         }
4361 }
4362
4363 /*
4364  * Scan for new timelines that might have appeared in the archive since we
4365  * started recovery.
4366  *
4367  * If there are any, the function changes recovery target TLI to the latest
4368  * one and returns 'true'.
4369  */
4370 static bool
4371 rescanLatestTimeLine(void)
4372 {
4373         List       *newExpectedTLEs;
4374         bool            found;
4375         ListCell   *cell;
4376         TimeLineID      newtarget;
4377         TimeLineID      oldtarget = recoveryTargetTLI;
4378         TimeLineHistoryEntry *currentTle = NULL;
4379
4380         newtarget = findNewestTimeLine(recoveryTargetTLI);
4381         if (newtarget == recoveryTargetTLI)
4382         {
4383                 /* No new timelines found */
4384                 return false;
4385         }
4386
4387         /*
4388          * Determine the list of expected TLIs for the new TLI
4389          */
4390
4391         newExpectedTLEs = readTimeLineHistory(newtarget);
4392
4393         /*
4394          * If the current timeline is not part of the history of the new timeline,
4395          * we cannot proceed to it.
4396          */
4397         found = false;
4398         foreach(cell, newExpectedTLEs)
4399         {
4400                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4401
4402                 if (currentTle->tli == recoveryTargetTLI)
4403                 {
4404                         found = true;
4405                         break;
4406                 }
4407         }
4408         if (!found)
4409         {
4410                 ereport(LOG,
4411                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4412                                                 newtarget,
4413                                                 ThisTimeLineID)));
4414                 return false;
4415         }
4416
4417         /*
4418          * The current timeline was found in the history file, but check that the
4419          * next timeline was forked off from it *after* the current recovery
4420          * location.
4421          */
4422         if (currentTle->end < EndRecPtr)
4423         {
4424                 ereport(LOG,
4425                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4426                                                 newtarget,
4427                                                 ThisTimeLineID,
4428                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4429                 return false;
4430         }
4431
4432         /* The new timeline history seems valid. Switch target */
4433         recoveryTargetTLI = newtarget;
4434         list_free_deep(expectedTLEs);
4435         expectedTLEs = newExpectedTLEs;
4436
4437         /*
4438          * As in StartupXLOG(), try to ensure we have all the history files
4439          * between the old target and new target in pg_wal.
4440          */
4441         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4442
4443         ereport(LOG,
4444                         (errmsg("new target timeline is %u",
4445                                         recoveryTargetTLI)));
4446
4447         return true;
4448 }
4449
4450 /*
4451  * I/O routines for pg_control
4452  *
4453  * *ControlFile is a buffer in shared memory that holds an image of the
4454  * contents of pg_control.  WriteControlFile() initializes pg_control
4455  * given a preloaded buffer, ReadControlFile() loads the buffer from
4456  * the pg_control file (during postmaster or standalone-backend startup),
4457  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4458  *
4459  * For simplicity, WriteControlFile() initializes the fields of pg_control
4460  * that are related to checking backend/database compatibility, and
4461  * ReadControlFile() verifies they are correct.  We could split out the
4462  * I/O and compatibility-check functions, but there seems no need currently.
4463  */
4464 static void
4465 WriteControlFile(void)
4466 {
4467         int                     fd;
4468         char            buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
4469
4470         /*
4471          * Ensure that the size of the pg_control data structure is sane.  See the
4472          * comments for these symbols in pg_control.h.
4473          */
4474         StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
4475                                          "pg_control is too large for atomic disk writes");
4476         StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
4477                                          "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
4478
4479         /*
4480          * Initialize version and compatibility-check fields
4481          */
4482         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4483         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4484
4485         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4486         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4487
4488         ControlFile->blcksz = BLCKSZ;
4489         ControlFile->relseg_size = RELSEG_SIZE;
4490         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4491         ControlFile->xlog_seg_size = wal_segment_size;
4492
4493         ControlFile->nameDataLen = NAMEDATALEN;
4494         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4495
4496         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4497         ControlFile->loblksize = LOBLKSIZE;
4498
4499         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4500         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4501
4502         /* Contents are protected with a CRC */
4503         INIT_CRC32C(ControlFile->crc);
4504         COMP_CRC32C(ControlFile->crc,
4505                                 (char *) ControlFile,
4506                                 offsetof(ControlFileData, crc));
4507         FIN_CRC32C(ControlFile->crc);
4508
4509         /*
4510          * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4511          * the excess over sizeof(ControlFileData).  This reduces the odds of
4512          * premature-EOF errors when reading pg_control.  We'll still fail when we
4513          * check the contents of the file, but hopefully with a more specific
4514          * error than "couldn't read pg_control".
4515          */
4516         memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4517         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4518
4519         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4520                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4521         if (fd < 0)
4522                 ereport(PANIC,
4523                                 (errcode_for_file_access(),
4524                                  errmsg("could not create file \"%s\": %m",
4525                                                 XLOG_CONTROL_FILE)));
4526
4527         errno = 0;
4528         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4529         if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4530         {
4531                 /* if write didn't set errno, assume problem is no disk space */
4532                 if (errno == 0)
4533                         errno = ENOSPC;
4534                 ereport(PANIC,
4535                                 (errcode_for_file_access(),
4536                                  errmsg("could not write to file \"%s\": %m",
4537                                                 XLOG_CONTROL_FILE)));
4538         }
4539         pgstat_report_wait_end();
4540
4541         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4542         if (pg_fsync(fd) != 0)
4543                 ereport(PANIC,
4544                                 (errcode_for_file_access(),
4545                                  errmsg("could not fsync file \"%s\": %m",
4546                                                 XLOG_CONTROL_FILE)));
4547         pgstat_report_wait_end();
4548
4549         if (close(fd))
4550                 ereport(PANIC,
4551                                 (errcode_for_file_access(),
4552                                  errmsg("could not close file \"%s\": %m",
4553                                                 XLOG_CONTROL_FILE)));
4554 }
4555
4556 static void
4557 ReadControlFile(void)
4558 {
4559         pg_crc32c       crc;
4560         int                     fd;
4561         static char wal_segsz_str[20];
4562         int                     r;
4563
4564         /*
4565          * Read data...
4566          */
4567         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4568                                            O_RDWR | PG_BINARY);
4569         if (fd < 0)
4570                 ereport(PANIC,
4571                                 (errcode_for_file_access(),
4572                                  errmsg("could not open file \"%s\": %m",
4573                                                 XLOG_CONTROL_FILE)));
4574
4575         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4576         r = read(fd, ControlFile, sizeof(ControlFileData));
4577         if (r != sizeof(ControlFileData))
4578         {
4579                 if (r < 0)
4580                         ereport(PANIC,
4581                                         (errcode_for_file_access(),
4582                                          errmsg("could not read file \"%s\": %m",
4583                                                         XLOG_CONTROL_FILE)));
4584                 else
4585                         ereport(PANIC,
4586                                         (errcode(ERRCODE_DATA_CORRUPTED),
4587                                          errmsg("could not read file \"%s\": read %d of %zu",
4588                                                         XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4589         }
4590         pgstat_report_wait_end();
4591
4592         close(fd);
4593
4594         /*
4595          * Check for expected pg_control format version.  If this is wrong, the
4596          * CRC check will likely fail because we'll be checking the wrong number
4597          * of bytes.  Complaining about wrong version will probably be more
4598          * enlightening than complaining about wrong CRC.
4599          */
4600
4601         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4602                 ereport(FATAL,
4603                                 (errmsg("database files are incompatible with server"),
4604                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4605                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4606                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
4607                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4608                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4609
4610         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4611                 ereport(FATAL,
4612                                 (errmsg("database files are incompatible with server"),
4613                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4614                                                    " but the server was compiled with PG_CONTROL_VERSION %d.",
4615                                                    ControlFile->pg_control_version, PG_CONTROL_VERSION),
4616                                  errhint("It looks like you need to initdb.")));
4617
4618         /* Now check the CRC. */
4619         INIT_CRC32C(crc);
4620         COMP_CRC32C(crc,
4621                                 (char *) ControlFile,
4622                                 offsetof(ControlFileData, crc));
4623         FIN_CRC32C(crc);
4624
4625         if (!EQ_CRC32C(crc, ControlFile->crc))
4626                 ereport(FATAL,
4627                                 (errmsg("incorrect checksum in control file")));
4628
4629         /*
4630          * Do compatibility checking immediately.  If the database isn't
4631          * compatible with the backend executable, we want to abort before we can
4632          * possibly do any damage.
4633          */
4634         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4635                 ereport(FATAL,
4636                                 (errmsg("database files are incompatible with server"),
4637                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4638                                                    " but the server was compiled with CATALOG_VERSION_NO %d.",
4639                                                    ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4640                                  errhint("It looks like you need to initdb.")));
4641         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4642                 ereport(FATAL,
4643                                 (errmsg("database files are incompatible with server"),
4644                                  errdetail("The database cluster was initialized with MAXALIGN %d,"
4645                                                    " but the server was compiled with MAXALIGN %d.",
4646                                                    ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4647                                  errhint("It looks like you need to initdb.")));
4648         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4649                 ereport(FATAL,
4650                                 (errmsg("database files are incompatible with server"),
4651                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4652                                  errhint("It looks like you need to initdb.")));
4653         if (ControlFile->blcksz != BLCKSZ)
4654                 ereport(FATAL,
4655                                 (errmsg("database files are incompatible with server"),
4656                                  errdetail("The database cluster was initialized with BLCKSZ %d,"
4657                                                    " but the server was compiled with BLCKSZ %d.",
4658                                                    ControlFile->blcksz, BLCKSZ),
4659                                  errhint("It looks like you need to recompile or initdb.")));
4660         if (ControlFile->relseg_size != RELSEG_SIZE)
4661                 ereport(FATAL,
4662                                 (errmsg("database files are incompatible with server"),
4663                                  errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4664                                                    " but the server was compiled with RELSEG_SIZE %d.",
4665                                                    ControlFile->relseg_size, RELSEG_SIZE),
4666                                  errhint("It looks like you need to recompile or initdb.")));
4667         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4668                 ereport(FATAL,
4669                                 (errmsg("database files are incompatible with server"),
4670                                  errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4671                                                    " but the server was compiled with XLOG_BLCKSZ %d.",
4672                                                    ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4673                                  errhint("It looks like you need to recompile or initdb.")));
4674         if (ControlFile->nameDataLen != NAMEDATALEN)
4675                 ereport(FATAL,
4676                                 (errmsg("database files are incompatible with server"),
4677                                  errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4678                                                    " but the server was compiled with NAMEDATALEN %d.",
4679                                                    ControlFile->nameDataLen, NAMEDATALEN),
4680                                  errhint("It looks like you need to recompile or initdb.")));
4681         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4682                 ereport(FATAL,
4683                                 (errmsg("database files are incompatible with server"),
4684                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4685                                                    " but the server was compiled with INDEX_MAX_KEYS %d.",
4686                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4687                                  errhint("It looks like you need to recompile or initdb.")));
4688         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4689                 ereport(FATAL,
4690                                 (errmsg("database files are incompatible with server"),
4691                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4692                                                    " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4693                                                    ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4694                                  errhint("It looks like you need to recompile or initdb.")));
4695         if (ControlFile->loblksize != LOBLKSIZE)
4696                 ereport(FATAL,
4697                                 (errmsg("database files are incompatible with server"),
4698                                  errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4699                                                    " but the server was compiled with LOBLKSIZE %d.",
4700                                                    ControlFile->loblksize, (int) LOBLKSIZE),
4701                                  errhint("It looks like you need to recompile or initdb.")));
4702
4703 #ifdef USE_FLOAT4_BYVAL
4704         if (ControlFile->float4ByVal != true)
4705                 ereport(FATAL,
4706                                 (errmsg("database files are incompatible with server"),
4707                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4708                                                    " but the server was compiled with USE_FLOAT4_BYVAL."),
4709                                  errhint("It looks like you need to recompile or initdb.")));
4710 #else
4711         if (ControlFile->float4ByVal != false)
4712                 ereport(FATAL,
4713                                 (errmsg("database files are incompatible with server"),
4714                                  errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4715                                                    " but the server was compiled without USE_FLOAT4_BYVAL."),
4716                                  errhint("It looks like you need to recompile or initdb.")));
4717 #endif
4718
4719 #ifdef USE_FLOAT8_BYVAL
4720         if (ControlFile->float8ByVal != true)
4721                 ereport(FATAL,
4722                                 (errmsg("database files are incompatible with server"),
4723                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4724                                                    " but the server was compiled with USE_FLOAT8_BYVAL."),
4725                                  errhint("It looks like you need to recompile or initdb.")));
4726 #else
4727         if (ControlFile->float8ByVal != false)
4728                 ereport(FATAL,
4729                                 (errmsg("database files are incompatible with server"),
4730                                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4731                                                    " but the server was compiled without USE_FLOAT8_BYVAL."),
4732                                  errhint("It looks like you need to recompile or initdb.")));
4733 #endif
4734
4735         wal_segment_size = ControlFile->xlog_seg_size;
4736
4737         if (!IsValidWalSegSize(wal_segment_size))
4738                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4739                                                 errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
4740                                                                           "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
4741                                                                           wal_segment_size,
4742                                                                           wal_segment_size)));
4743
4744         snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4745         SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4746                                         PGC_S_OVERRIDE);
4747
4748         /* check and update variables dependent on wal_segment_size */
4749         if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
4750                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4751                                                 errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\".")));
4752
4753         if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
4754                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4755                                                 errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\".")));
4756
4757         UsableBytesInSegment =
4758                 (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4759                 (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4760
4761         CalculateCheckpointSegments();
4762
4763         /* Make the initdb settings visible as GUC variables, too */
4764         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4765                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4766 }
4767
4768 void
4769 UpdateControlFile(void)
4770 {
4771         int                     fd;
4772
4773         INIT_CRC32C(ControlFile->crc);
4774         COMP_CRC32C(ControlFile->crc,
4775                                 (char *) ControlFile,
4776                                 offsetof(ControlFileData, crc));
4777         FIN_CRC32C(ControlFile->crc);
4778
4779         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4780                                            O_RDWR | PG_BINARY);
4781         if (fd < 0)
4782                 ereport(PANIC,
4783                                 (errcode_for_file_access(),
4784                                  errmsg("could not open file \"%s\": %m", XLOG_CONTROL_FILE)));
4785
4786         errno = 0;
4787         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
4788         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4789         {
4790                 /* if write didn't set errno, assume problem is no disk space */
4791                 if (errno == 0)
4792                         errno = ENOSPC;
4793                 ereport(PANIC,
4794                                 (errcode_for_file_access(),
4795                                  errmsg("could not write to file \"%s\": %m",
4796                                                 XLOG_CONTROL_FILE)));
4797         }
4798         pgstat_report_wait_end();
4799
4800         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
4801         if (pg_fsync(fd) != 0)
4802                 ereport(PANIC,
4803                                 (errcode_for_file_access(),
4804                                  errmsg("could not fsync file \"%s\": %m",
4805                                                 XLOG_CONTROL_FILE)));
4806         pgstat_report_wait_end();
4807
4808         if (close(fd))
4809                 ereport(PANIC,
4810                                 (errcode_for_file_access(),
4811                                  errmsg("could not close file \"%s\": %m",
4812                                                 XLOG_CONTROL_FILE)));
4813 }
4814
4815 /*
4816  * Returns the unique system identifier from control file.
4817  */
4818 uint64
4819 GetSystemIdentifier(void)
4820 {
4821         Assert(ControlFile != NULL);
4822         return ControlFile->system_identifier;
4823 }
4824
4825 /*
4826  * Returns the random nonce from control file.
4827  */
4828 char *
4829 GetMockAuthenticationNonce(void)
4830 {
4831         Assert(ControlFile != NULL);
4832         return ControlFile->mock_authentication_nonce;
4833 }
4834
4835 /*
4836  * Are checksums enabled for data pages?
4837  */
4838 bool
4839 DataChecksumsEnabled(void)
4840 {
4841         Assert(ControlFile != NULL);
4842         return (ControlFile->data_checksum_version > 0);
4843 }
4844
4845 /*
4846  * Returns a fake LSN for unlogged relations.
4847  *
4848  * Each call generates an LSN that is greater than any previous value
4849  * returned. The current counter value is saved and restored across clean
4850  * shutdowns, but like unlogged relations, does not survive a crash. This can
4851  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4852  * LSN-like increasing sequence of numbers without writing any WAL.
4853  */
4854 XLogRecPtr
4855 GetFakeLSNForUnloggedRel(void)
4856 {
4857         XLogRecPtr      nextUnloggedLSN;
4858
4859         /* increment the unloggedLSN counter, need SpinLock */
4860         SpinLockAcquire(&XLogCtl->ulsn_lck);
4861         nextUnloggedLSN = XLogCtl->unloggedLSN++;
4862         SpinLockRelease(&XLogCtl->ulsn_lck);
4863
4864         return nextUnloggedLSN;
4865 }
4866
4867 /*
4868  * Auto-tune the number of XLOG buffers.
4869  *
4870  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4871  * a maximum of one XLOG segment (there is little reason to think that more
4872  * is helpful, at least so long as we force an fsync when switching log files)
4873  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4874  * 9.1, when auto-tuning was added).
4875  *
4876  * This should not be called until NBuffers has received its final value.
4877  */
4878 static int
4879 XLOGChooseNumBuffers(void)
4880 {
4881         int                     xbuffers;
4882
4883         xbuffers = NBuffers / 32;
4884         if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4885                 xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4886         if (xbuffers < 8)
4887                 xbuffers = 8;
4888         return xbuffers;
4889 }
4890
4891 /*
4892  * GUC check_hook for wal_buffers
4893  */
4894 bool
4895 check_wal_buffers(int *newval, void **extra, GucSource source)
4896 {
4897         /*
4898          * -1 indicates a request for auto-tune.
4899          */
4900         if (*newval == -1)
4901         {
4902                 /*
4903                  * If we haven't yet changed the boot_val default of -1, just let it
4904                  * be.  We'll fix it when XLOGShmemSize is called.
4905                  */
4906                 if (XLOGbuffers == -1)
4907                         return true;
4908
4909                 /* Otherwise, substitute the auto-tune value */
4910                 *newval = XLOGChooseNumBuffers();
4911         }
4912
4913         /*
4914          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4915          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4916          * the case, we just silently treat such values as a request for the
4917          * minimum.  (We could throw an error instead, but that doesn't seem very
4918          * helpful.)
4919          */
4920         if (*newval < 4)
4921                 *newval = 4;
4922
4923         return true;
4924 }
4925
4926 /*
4927  * Read the control file, set respective GUCs.
4928  *
4929  * This is to be called during startup, including a crash recovery cycle,
4930  * unless in bootstrap mode, where no control file yet exists.  As there's no
4931  * usable shared memory yet (its sizing can depend on the contents of the
4932  * control file!), first store the contents in local memory. XLOGShmemInit()
4933  * will then copy it to shared memory later.
4934  *
4935  * reset just controls whether previous contents are to be expected (in the
4936  * reset case, there's a dangling pointer into old shared memory), or not.
4937  */
4938 void
4939 LocalProcessControlFile(bool reset)
4940 {
4941         Assert(reset || ControlFile == NULL);
4942         ControlFile = palloc(sizeof(ControlFileData));
4943         ReadControlFile();
4944 }
4945
4946 /*
4947  * Initialization of shared memory for XLOG
4948  */
4949 Size
4950 XLOGShmemSize(void)
4951 {
4952         Size            size;
4953
4954         /*
4955          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4956          * This isn't an amazingly clean place to do this, but we must wait till
4957          * NBuffers has received its final value, and must do it before using the
4958          * value of XLOGbuffers to do anything important.
4959          */
4960         if (XLOGbuffers == -1)
4961         {
4962                 char            buf[32];
4963
4964                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4965                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4966         }
4967         Assert(XLOGbuffers > 0);
4968
4969         /* XLogCtl */
4970         size = sizeof(XLogCtlData);
4971
4972         /* WAL insertion locks, plus alignment */
4973         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4974         /* xlblocks array */
4975         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4976         /* extra alignment padding for XLOG I/O buffers */
4977         size = add_size(size, XLOG_BLCKSZ);
4978         /* and the buffers themselves */
4979         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4980
4981         /*
4982          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4983          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4984          * routine again below to compute the actual allocation size.
4985          */
4986
4987         return size;
4988 }
4989
4990 void
4991 XLOGShmemInit(void)
4992 {
4993         bool            foundCFile,
4994                                 foundXLog;
4995         char       *allocptr;
4996         int                     i;
4997         ControlFileData *localControlFile;
4998
4999 #ifdef WAL_DEBUG
5000
5001         /*
5002          * Create a memory context for WAL debugging that's exempt from the normal
5003          * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
5004          * an allocation fails, but wal_debug is not for production use anyway.
5005          */
5006         if (walDebugCxt == NULL)
5007         {
5008                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
5009                                                                                         "WAL Debug",
5010                                                                                         ALLOCSET_DEFAULT_SIZES);
5011                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
5012         }
5013 #endif
5014
5015
5016         XLogCtl = (XLogCtlData *)
5017                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5018
5019         localControlFile = ControlFile;
5020         ControlFile = (ControlFileData *)
5021                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5022
5023         if (foundCFile || foundXLog)
5024         {
5025                 /* both should be present or neither */
5026                 Assert(foundCFile && foundXLog);
5027
5028                 /* Initialize local copy of WALInsertLocks and register the tranche */
5029                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
5030                 LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
5031                                                           "wal_insert");
5032
5033                 if (localControlFile)
5034                         pfree(localControlFile);
5035                 return;
5036         }
5037         memset(XLogCtl, 0, sizeof(XLogCtlData));
5038
5039         /*
5040          * Already have read control file locally, unless in bootstrap mode. Move
5041          * contents into shared memory.
5042          */
5043         if (localControlFile)
5044         {
5045                 memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
5046                 pfree(localControlFile);
5047         }
5048
5049         /*
5050          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5051          * multiple of the alignment for same, so no extra alignment padding is
5052          * needed here.
5053          */
5054         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5055         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5056         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5057         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5058
5059
5060         /* WAL insertion locks. Ensure they're aligned to the full padded size */
5061         allocptr += sizeof(WALInsertLockPadded) -
5062                 ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5063         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5064                 (WALInsertLockPadded *) allocptr;
5065         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5066
5067         LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
5068         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
5069         {
5070                 LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5071                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
5072                 WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5073         }
5074
5075         /*
5076          * Align the start of the page buffers to a full xlog block size boundary.
5077          * This simplifies some calculations in XLOG insertion. It is also
5078          * required for O_DIRECT.
5079          */
5080         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5081         XLogCtl->pages = allocptr;
5082         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5083
5084         /*
5085          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5086          * in additional info.)
5087          */
5088         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5089         XLogCtl->SharedRecoveryInProgress = true;
5090         XLogCtl->SharedHotStandbyActive = false;
5091         XLogCtl->WalWriterSleeping = false;
5092
5093         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5094         SpinLockInit(&XLogCtl->info_lck);
5095         SpinLockInit(&XLogCtl->ulsn_lck);
5096         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5097 }
5098
5099 /*
5100  * This func must be called ONCE on system install.  It creates pg_control
5101  * and the initial XLOG segment.
5102  */
5103 void
5104 BootStrapXLOG(void)
5105 {
5106         CheckPoint      checkPoint;
5107         char       *buffer;
5108         XLogPageHeader page;
5109         XLogLongPageHeader longpage;
5110         XLogRecord *record;
5111         char       *recptr;
5112         bool            use_existent;
5113         uint64          sysidentifier;
5114         char            mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
5115         struct timeval tv;
5116         pg_crc32c       crc;
5117
5118         /*
5119          * Select a hopefully-unique system identifier code for this installation.
5120          * We use the result of gettimeofday(), including the fractional seconds
5121          * field, as being about as unique as we can easily get.  (Think not to
5122          * use random(), since it hasn't been seeded and there's no portable way
5123          * to seed it other than the system clock value...)  The upper half of the
5124          * uint64 value is just the tv_sec part, while the lower half contains the
5125          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5126          * PID for a little extra uniqueness.  A person knowing this encoding can
5127          * determine the initialization time of the installation, which could
5128          * perhaps be useful sometimes.
5129          */
5130         gettimeofday(&tv, NULL);
5131         sysidentifier = ((uint64) tv.tv_sec) << 32;
5132         sysidentifier |= ((uint64) tv.tv_usec) << 12;
5133         sysidentifier |= getpid() & 0xFFF;
5134
5135         /*
5136          * Generate a random nonce. This is used for authentication requests that
5137          * will fail because the user does not exist. The nonce is used to create
5138          * a genuine-looking password challenge for the non-existent user, in lieu
5139          * of an actual stored password.
5140          */
5141         if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
5142                 ereport(PANIC,
5143                                 (errcode(ERRCODE_INTERNAL_ERROR),
5144                                  errmsg("could not generate secret authorization token")));
5145
5146         /* First timeline ID is always 1 */
5147         ThisTimeLineID = 1;
5148
5149         /* page buffer must be aligned suitably for O_DIRECT */
5150         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5151         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5152         memset(page, 0, XLOG_BLCKSZ);
5153
5154         /*
5155          * Set up information for the initial checkpoint record
5156          *
5157          * The initial checkpoint record is written to the beginning of the WAL
5158          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5159          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5160          */
5161         checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5162         checkPoint.ThisTimeLineID = ThisTimeLineID;
5163         checkPoint.PrevTimeLineID = ThisTimeLineID;
5164         checkPoint.fullPageWrites = fullPageWrites;
5165         checkPoint.nextXidEpoch = 0;
5166         checkPoint.nextXid = FirstNormalTransactionId;
5167         checkPoint.nextOid = FirstBootstrapObjectId;
5168         checkPoint.nextMulti = FirstMultiXactId;
5169         checkPoint.nextMultiOffset = 0;
5170         checkPoint.oldestXid = FirstNormalTransactionId;
5171         checkPoint.oldestXidDB = TemplateDbOid;
5172         checkPoint.oldestMulti = FirstMultiXactId;
5173         checkPoint.oldestMultiDB = TemplateDbOid;
5174         checkPoint.oldestCommitTsXid = InvalidTransactionId;
5175         checkPoint.newestCommitTsXid = InvalidTransactionId;
5176         checkPoint.time = (pg_time_t) time(NULL);
5177         checkPoint.oldestActiveXid = InvalidTransactionId;
5178
5179         ShmemVariableCache->nextXid = checkPoint.nextXid;
5180         ShmemVariableCache->nextOid = checkPoint.nextOid;
5181         ShmemVariableCache->oidCount = 0;
5182         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5183         AdvanceOldestClogXid(checkPoint.oldestXid);
5184         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5185         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5186         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5187
5188         /* Set up the XLOG page header */
5189         page->xlp_magic = XLOG_PAGE_MAGIC;
5190         page->xlp_info = XLP_LONG_HEADER;
5191         page->xlp_tli = ThisTimeLineID;
5192         page->xlp_pageaddr = wal_segment_size;
5193         longpage = (XLogLongPageHeader) page;
5194         longpage->xlp_sysid = sysidentifier;
5195         longpage->xlp_seg_size = wal_segment_size;
5196         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5197
5198         /* Insert the initial checkpoint record */
5199         recptr = ((char *) page + SizeOfXLogLongPHD);
5200         record = (XLogRecord *) recptr;
5201         record->xl_prev = 0;
5202         record->xl_xid = InvalidTransactionId;
5203         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5204         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5205         record->xl_rmid = RM_XLOG_ID;
5206         recptr += SizeOfXLogRecord;
5207         /* fill the XLogRecordDataHeaderShort struct */
5208         *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5209         *(recptr++) = sizeof(checkPoint);
5210         memcpy(recptr, &checkPoint, sizeof(checkPoint));
5211         recptr += sizeof(checkPoint);
5212         Assert(recptr - (char *) record == record->xl_tot_len);
5213
5214         INIT_CRC32C(crc);
5215         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5216         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5217         FIN_CRC32C(crc);
5218         record->xl_crc = crc;
5219
5220         /* Create first XLOG segment file */
5221         use_existent = false;
5222         openLogFile = XLogFileInit(1, &use_existent, false);
5223
5224         /* Write the first page with the initial record */
5225         errno = 0;
5226         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5227         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5228         {
5229                 /* if write didn't set errno, assume problem is no disk space */
5230                 if (errno == 0)
5231                         errno = ENOSPC;
5232                 ereport(PANIC,
5233                                 (errcode_for_file_access(),
5234                                  errmsg("could not write bootstrap write-ahead log file: %m")));
5235         }
5236         pgstat_report_wait_end();
5237
5238         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5239         if (pg_fsync(openLogFile) != 0)
5240                 ereport(PANIC,
5241                                 (errcode_for_file_access(),
5242                                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
5243         pgstat_report_wait_end();
5244
5245         if (close(openLogFile))
5246                 ereport(PANIC,
5247                                 (errcode_for_file_access(),
5248                                  errmsg("could not close bootstrap write-ahead log file: %m")));
5249
5250         openLogFile = -1;
5251
5252         /* Now create pg_control */
5253
5254         memset(ControlFile, 0, sizeof(ControlFileData));
5255         /* Initialize pg_control status fields */
5256         ControlFile->system_identifier = sysidentifier;
5257         memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5258         ControlFile->state = DB_SHUTDOWNED;
5259         ControlFile->time = checkPoint.time;
5260         ControlFile->checkPoint = checkPoint.redo;
5261         ControlFile->checkPointCopy = checkPoint;
5262         ControlFile->unloggedLSN = 1;
5263
5264         /* Set important parameter values for use when replaying WAL */
5265         ControlFile->MaxConnections = MaxConnections;
5266         ControlFile->max_worker_processes = max_worker_processes;
5267         ControlFile->max_prepared_xacts = max_prepared_xacts;
5268         ControlFile->max_locks_per_xact = max_locks_per_xact;
5269         ControlFile->wal_level = wal_level;
5270         ControlFile->wal_log_hints = wal_log_hints;
5271         ControlFile->track_commit_timestamp = track_commit_timestamp;
5272         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5273
5274         /* some additional ControlFile fields are set in WriteControlFile() */
5275
5276         WriteControlFile();
5277
5278         /* Bootstrap the commit log, too */
5279         BootStrapCLOG();
5280         BootStrapCommitTs();
5281         BootStrapSUBTRANS();
5282         BootStrapMultiXact();
5283
5284         pfree(buffer);
5285
5286         /*
5287          * Force control file to be read - in contrast to normal processing we'd
5288          * otherwise never run the checks and GUC related initializations therein.
5289          */
5290         ReadControlFile();
5291 }
5292
5293 static char *
5294 str_time(pg_time_t tnow)
5295 {
5296         static char buf[128];
5297
5298         pg_strftime(buf, sizeof(buf),
5299                                 "%Y-%m-%d %H:%M:%S %Z",
5300                                 pg_localtime(&tnow, log_timezone));
5301
5302         return buf;
5303 }
5304
5305 /*
5306  * See if there is a recovery command file (recovery.conf), and if so
5307  * read in parameters for archive recovery and XLOG streaming.
5308  *
5309  * The file is parsed using the main configuration parser.
5310  */
5311 static void
5312 readRecoveryCommandFile(void)
5313 {
5314         FILE       *fd;
5315         TimeLineID      rtli = 0;
5316         bool            rtliGiven = false;
5317         ConfigVariable *item,
5318                            *head = NULL,
5319                            *tail = NULL;
5320         bool            recoveryTargetActionSet = false;
5321
5322
5323         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5324         if (fd == NULL)
5325         {
5326                 if (errno == ENOENT)
5327                         return;                         /* not there, so no archive recovery */
5328                 ereport(FATAL,
5329                                 (errcode_for_file_access(),
5330                                  errmsg("could not open recovery command file \"%s\": %m",
5331                                                 RECOVERY_COMMAND_FILE)));
5332         }
5333
5334         /*
5335          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5336          * no need to check the return value.
5337          */
5338         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5339
5340         FreeFile(fd);
5341
5342         for (item = head; item; item = item->next)
5343         {
5344                 if (strcmp(item->name, "restore_command") == 0)
5345                 {
5346                         recoveryRestoreCommand = pstrdup(item->value);
5347                         ereport(DEBUG2,
5348                                         (errmsg_internal("restore_command = '%s'",
5349                                                                          recoveryRestoreCommand)));
5350                 }
5351                 else if (strcmp(item->name, "recovery_end_command") == 0)
5352                 {
5353                         recoveryEndCommand = pstrdup(item->value);
5354                         ereport(DEBUG2,
5355                                         (errmsg_internal("recovery_end_command = '%s'",
5356                                                                          recoveryEndCommand)));
5357                 }
5358                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5359                 {
5360                         archiveCleanupCommand = pstrdup(item->value);
5361                         ereport(DEBUG2,
5362                                         (errmsg_internal("archive_cleanup_command = '%s'",
5363                                                                          archiveCleanupCommand)));
5364                 }
5365                 else if (strcmp(item->name, "recovery_target_action") == 0)
5366                 {
5367                         if (strcmp(item->value, "pause") == 0)
5368                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5369                         else if (strcmp(item->value, "promote") == 0)
5370                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5371                         else if (strcmp(item->value, "shutdown") == 0)
5372                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5373                         else
5374                                 ereport(ERROR,
5375                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5376                                                  errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5377                                                                 "recovery_target_action",
5378                                                                 item->value),
5379                                                  errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5380
5381                         ereport(DEBUG2,
5382                                         (errmsg_internal("recovery_target_action = '%s'",
5383                                                                          item->value)));
5384
5385                         recoveryTargetActionSet = true;
5386                 }
5387                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5388                 {
5389                         rtliGiven = true;
5390                         if (strcmp(item->value, "latest") == 0)
5391                                 rtli = 0;
5392                         else
5393                         {
5394                                 errno = 0;
5395                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5396                                 if (errno == EINVAL || errno == ERANGE)
5397                                         ereport(FATAL,
5398                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5399                                                          errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5400                                                                         item->value)));
5401                         }
5402                         if (rtli)
5403                                 ereport(DEBUG2,
5404                                                 (errmsg_internal("recovery_target_timeline = %u", rtli)));
5405                         else
5406                                 ereport(DEBUG2,
5407                                                 (errmsg_internal("recovery_target_timeline = latest")));
5408                 }
5409                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5410                 {
5411                         errno = 0;
5412                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5413                         if (errno == EINVAL || errno == ERANGE)
5414                                 ereport(FATAL,
5415                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5416                                                  errmsg("recovery_target_xid is not a valid number: \"%s\"",
5417                                                                 item->value)));
5418                         ereport(DEBUG2,
5419                                         (errmsg_internal("recovery_target_xid = %u",
5420                                                                          recoveryTargetXid)));
5421                         recoveryTarget = RECOVERY_TARGET_XID;
5422                 }
5423                 else if (strcmp(item->name, "recovery_target_time") == 0)
5424                 {
5425                         recoveryTarget = RECOVERY_TARGET_TIME;
5426
5427                         if (strcmp(item->value, "epoch") == 0 ||
5428                                 strcmp(item->value, "infinity") == 0 ||
5429                                 strcmp(item->value, "-infinity") == 0 ||
5430                                 strcmp(item->value, "now") == 0 ||
5431                                 strcmp(item->value, "today") == 0 ||
5432                                 strcmp(item->value, "tomorrow") == 0 ||
5433                                 strcmp(item->value, "yesterday") == 0)
5434                                 ereport(FATAL,
5435                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5436                                                  errmsg("recovery_target_time is not a valid timestamp: \"%s\"",
5437                                                                 item->value)));
5438
5439                         /*
5440                          * Convert the time string given by the user to TimestampTz form.
5441                          */
5442                         recoveryTargetTime =
5443                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5444                                                                                                                 CStringGetDatum(item->value),
5445                                                                                                                 ObjectIdGetDatum(InvalidOid),
5446                                                                                                                 Int32GetDatum(-1)));
5447                         ereport(DEBUG2,
5448                                         (errmsg_internal("recovery_target_time = '%s'",
5449                                                                          timestamptz_to_str(recoveryTargetTime))));
5450                 }
5451                 else if (strcmp(item->name, "recovery_target_name") == 0)
5452                 {
5453                         recoveryTarget = RECOVERY_TARGET_NAME;
5454
5455                         recoveryTargetName = pstrdup(item->value);
5456                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5457                                 ereport(FATAL,
5458                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5459                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5460                                                                 MAXFNAMELEN - 1)));
5461
5462                         ereport(DEBUG2,
5463                                         (errmsg_internal("recovery_target_name = '%s'",
5464                                                                          recoveryTargetName)));
5465                 }
5466                 else if (strcmp(item->name, "recovery_target_lsn") == 0)
5467                 {
5468                         recoveryTarget = RECOVERY_TARGET_LSN;
5469
5470                         /*
5471                          * Convert the LSN string given by the user to XLogRecPtr form.
5472                          */
5473                         recoveryTargetLSN =
5474                                 DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
5475                                                                                                 CStringGetDatum(item->value),
5476                                                                                                 ObjectIdGetDatum(InvalidOid),
5477                                                                                                 Int32GetDatum(-1)));
5478                         ereport(DEBUG2,
5479                                         (errmsg_internal("recovery_target_lsn = '%X/%X'",
5480                                                                          (uint32) (recoveryTargetLSN >> 32),
5481                                                                          (uint32) recoveryTargetLSN)));
5482                 }
5483                 else if (strcmp(item->name, "recovery_target") == 0)
5484                 {
5485                         if (strcmp(item->value, "immediate") == 0)
5486                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5487                         else
5488                                 ereport(ERROR,
5489                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5490                                                  errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5491                                                                 "recovery_target",
5492                                                                 item->value),
5493                                                  errhint("The only allowed value is \"immediate\".")));
5494                         ereport(DEBUG2,
5495                                         (errmsg_internal("recovery_target = '%s'",
5496                                                                          item->value)));
5497                 }
5498                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5499                 {
5500                         /*
5501                          * does nothing if a recovery_target is not also set
5502                          */
5503                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5504                                 ereport(ERROR,
5505                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5506                                                  errmsg("parameter \"%s\" requires a Boolean value",
5507                                                                 "recovery_target_inclusive")));
5508                         ereport(DEBUG2,
5509                                         (errmsg_internal("recovery_target_inclusive = %s",
5510                                                                          item->value)));
5511                 }
5512                 else if (strcmp(item->name, "standby_mode") == 0)
5513                 {
5514                         if (!parse_bool(item->value, &StandbyModeRequested))
5515                                 ereport(ERROR,
5516                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5517                                                  errmsg("parameter \"%s\" requires a Boolean value",
5518                                                                 "standby_mode")));
5519                         ereport(DEBUG2,
5520                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5521                 }
5522                 else if (strcmp(item->name, "primary_conninfo") == 0)
5523                 {
5524                         PrimaryConnInfo = pstrdup(item->value);
5525                         ereport(DEBUG2,
5526                                         (errmsg_internal("primary_conninfo = '%s'",
5527                                                                          PrimaryConnInfo)));
5528                 }
5529                 else if (strcmp(item->name, "primary_slot_name") == 0)
5530                 {
5531                         ReplicationSlotValidateName(item->value, ERROR);
5532                         PrimarySlotName = pstrdup(item->value);
5533                         ereport(DEBUG2,
5534                                         (errmsg_internal("primary_slot_name = '%s'",
5535                                                                          PrimarySlotName)));
5536                 }
5537                 else if (strcmp(item->name, "trigger_file") == 0)
5538                 {
5539                         TriggerFile = pstrdup(item->value);
5540                         ereport(DEBUG2,
5541                                         (errmsg_internal("trigger_file = '%s'",
5542                                                                          TriggerFile)));
5543                 }
5544                 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5545                 {
5546                         const char *hintmsg;
5547
5548                         if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5549                                                    &hintmsg))
5550                                 ereport(ERROR,
5551                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5552                                                  errmsg("parameter \"%s\" requires a temporal value",
5553                                                                 "recovery_min_apply_delay"),
5554                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5555                         ereport(DEBUG2,
5556                                         (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5557                 }
5558                 else
5559                         ereport(FATAL,
5560                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5561                                          errmsg("unrecognized recovery parameter \"%s\"",
5562                                                         item->name)));
5563         }
5564
5565         /*
5566          * Check for compulsory parameters
5567          */
5568         if (StandbyModeRequested)
5569         {
5570                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5571                         ereport(WARNING,
5572                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5573                                                         RECOVERY_COMMAND_FILE),
5574                                          errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5575         }
5576         else
5577         {
5578                 if (recoveryRestoreCommand == NULL)
5579                         ereport(FATAL,
5580                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5581                                          errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5582                                                         RECOVERY_COMMAND_FILE)));
5583         }
5584
5585         /*
5586          * Override any inconsistent requests. Note that this is a change of
5587          * behaviour in 9.5; prior to this we simply ignored a request to pause if
5588          * hot_standby = off, which was surprising behaviour.
5589          */
5590         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5591                 recoveryTargetActionSet &&
5592                 !EnableHotStandby)
5593                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5594
5595         /*
5596          * We don't support standby_mode in standalone backends; that requires
5597          * other processes such as the WAL receiver to be alive.
5598          */
5599         if (StandbyModeRequested && !IsUnderPostmaster)
5600                 ereport(FATAL,
5601                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5602                                  errmsg("standby mode is not supported by single-user servers")));
5603
5604         /* Enable fetching from archive recovery area */
5605         ArchiveRecoveryRequested = true;
5606
5607         /*
5608          * If user specified recovery_target_timeline, validate it or compute the
5609          * "latest" value.  We can't do this until after we've gotten the restore
5610          * command and set InArchiveRecovery, because we need to fetch timeline
5611          * history files from the archive.
5612          */
5613         if (rtliGiven)
5614         {
5615                 if (rtli)
5616                 {
5617                         /* Timeline 1 does not have a history file, all else should */
5618                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5619                                 ereport(FATAL,
5620                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5621                                                  errmsg("recovery target timeline %u does not exist",
5622                                                                 rtli)));
5623                         recoveryTargetTLI = rtli;
5624                         recoveryTargetIsLatest = false;
5625                 }
5626                 else
5627                 {
5628                         /* We start the "latest" search from pg_control's timeline */
5629                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5630                         recoveryTargetIsLatest = true;
5631                 }
5632         }
5633
5634         FreeConfigVariables(head);
5635 }
5636
5637 /*
5638  * Exit archive-recovery state
5639  */
5640 static void
5641 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5642 {
5643         char            recoveryPath[MAXPGPATH];
5644         char            xlogfname[MAXFNAMELEN];
5645         XLogSegNo       endLogSegNo;
5646         XLogSegNo       startLogSegNo;
5647
5648         /* we always switch to a new timeline after archive recovery */
5649         Assert(endTLI != ThisTimeLineID);
5650
5651         /*
5652          * We are no longer in archive recovery state.
5653          */
5654         InArchiveRecovery = false;
5655
5656         /*
5657          * Update min recovery point one last time.
5658          */
5659         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5660
5661         /*
5662          * If the ending log segment is still open, close it (to avoid problems on
5663          * Windows with trying to rename or delete an open file).
5664          */
5665         if (readFile >= 0)
5666         {
5667                 close(readFile);
5668                 readFile = -1;
5669         }
5670
5671         /*
5672          * Calculate the last segment on the old timeline, and the first segment
5673          * on the new timeline. If the switch happens in the middle of a segment,
5674          * they are the same, but if the switch happens exactly at a segment
5675          * boundary, startLogSegNo will be endLogSegNo + 1.
5676          */
5677         XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5678         XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5679
5680         /*
5681          * Initialize the starting WAL segment for the new timeline. If the switch
5682          * happens in the middle of a segment, copy data from the last WAL segment
5683          * of the old timeline up to the switch point, to the starting WAL segment
5684          * on the new timeline.
5685          */
5686         if (endLogSegNo == startLogSegNo)
5687         {
5688                 /*
5689                  * Make a copy of the file on the new timeline.
5690                  *
5691                  * Writing WAL isn't allowed yet, so there are no locking
5692                  * considerations. But we should be just as tense as XLogFileInit to
5693                  * avoid emplacing a bogus file.
5694                  */
5695                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5696                                          XLogSegmentOffset(endOfLog, wal_segment_size));
5697         }
5698         else
5699         {
5700                 /*
5701                  * The switch happened at a segment boundary, so just create the next
5702                  * segment on the new timeline.
5703                  */
5704                 bool            use_existent = true;
5705                 int                     fd;
5706
5707                 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5708
5709                 if (close(fd))
5710                         ereport(ERROR,
5711                                         (errcode_for_file_access(),
5712                                          errmsg("could not close file \"%s\": %m",
5713                                                         XLogFileNameP(ThisTimeLineID, startLogSegNo))));
5714         }
5715
5716         /*
5717          * Let's just make real sure there are not .ready or .done flags posted
5718          * for the new segment.
5719          */
5720         XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
5721         XLogArchiveCleanup(xlogfname);
5722
5723         /*
5724          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5725          * of it.
5726          */
5727         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5728         unlink(recoveryPath);           /* ignore any error */
5729
5730         /* Get rid of any remaining recovered timeline-history file, too */
5731         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5732         unlink(recoveryPath);           /* ignore any error */
5733
5734         /*
5735          * Rename the config file out of the way, so that we don't accidentally
5736          * re-enter archive recovery mode in a subsequent crash.
5737          */
5738         unlink(RECOVERY_COMMAND_DONE);
5739         durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
5740
5741         ereport(LOG,
5742                         (errmsg("archive recovery complete")));
5743 }
5744
5745 /*
5746  * Extract timestamp from WAL record.
5747  *
5748  * If the record contains a timestamp, returns true, and saves the timestamp
5749  * in *recordXtime. If the record type has no timestamp, returns false.
5750  * Currently, only transaction commit/abort records and restore points contain
5751  * timestamps.
5752  */
5753 static bool
5754 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5755 {
5756         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5757         uint8           xact_info = info & XLOG_XACT_OPMASK;
5758         uint8           rmid = XLogRecGetRmid(record);
5759
5760         if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5761         {
5762                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5763                 return true;
5764         }
5765         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5766                                                            xact_info == XLOG_XACT_COMMIT_PREPARED))
5767         {
5768                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5769                 return true;
5770         }
5771         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5772                                                            xact_info == XLOG_XACT_ABORT_PREPARED))
5773         {
5774                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5775                 return true;
5776         }
5777         return false;
5778 }
5779
5780 /*
5781  * For point-in-time recovery, this function decides whether we want to
5782  * stop applying the XLOG before the current record.
5783  *
5784  * Returns true if we are stopping, false otherwise. If stopping, some
5785  * information is saved in recoveryStopXid et al for use in annotating the
5786  * new timeline's history file.
5787  */
5788 static bool
5789 recoveryStopsBefore(XLogReaderState *record)
5790 {
5791         bool            stopsHere = false;
5792         uint8           xact_info;
5793         bool            isCommit;
5794         TimestampTz recordXtime = 0;
5795         TransactionId recordXid;
5796
5797         /* Check if we should stop as soon as reaching consistency */
5798         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5799         {
5800                 ereport(LOG,
5801                                 (errmsg("recovery stopping after reaching consistency")));
5802
5803                 recoveryStopAfter = false;
5804                 recoveryStopXid = InvalidTransactionId;
5805                 recoveryStopLSN = InvalidXLogRecPtr;
5806                 recoveryStopTime = 0;
5807                 recoveryStopName[0] = '\0';
5808                 return true;
5809         }
5810
5811         /* Check if target LSN has been reached */
5812         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5813                 !recoveryTargetInclusive &&
5814                 record->ReadRecPtr >= recoveryTargetLSN)
5815         {
5816                 recoveryStopAfter = false;
5817                 recoveryStopXid = InvalidTransactionId;
5818                 recoveryStopLSN = record->ReadRecPtr;
5819                 recoveryStopTime = 0;
5820                 recoveryStopName[0] = '\0';
5821                 ereport(LOG,
5822                                 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
5823                                                 (uint32) (recoveryStopLSN >> 32),
5824                                                 (uint32) recoveryStopLSN)));
5825                 return true;
5826         }
5827
5828         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5829         if (XLogRecGetRmid(record) != RM_XACT_ID)
5830                 return false;
5831
5832         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5833
5834         if (xact_info == XLOG_XACT_COMMIT)
5835         {
5836                 isCommit = true;
5837                 recordXid = XLogRecGetXid(record);
5838         }
5839         else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5840         {
5841                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5842                 xl_xact_parsed_commit parsed;
5843
5844                 isCommit = true;
5845                 ParseCommitRecord(XLogRecGetInfo(record),
5846                                                   xlrec,
5847                                                   &parsed);
5848                 recordXid = parsed.twophase_xid;
5849         }
5850         else if (xact_info == XLOG_XACT_ABORT)
5851         {
5852                 isCommit = false;
5853                 recordXid = XLogRecGetXid(record);
5854         }
5855         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5856         {
5857                 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5858                 xl_xact_parsed_abort parsed;
5859
5860                 isCommit = true;
5861                 ParseAbortRecord(XLogRecGetInfo(record),
5862                                                  xlrec,
5863                                                  &parsed);
5864                 recordXid = parsed.twophase_xid;
5865         }
5866         else
5867                 return false;
5868
5869         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5870         {
5871                 /*
5872                  * There can be only one transaction end record with this exact
5873                  * transactionid
5874                  *
5875                  * when testing for an xid, we MUST test for equality only, since
5876                  * transactions are numbered in the order they start, not the order
5877                  * they complete. A higher numbered xid will complete before you about
5878                  * 50% of the time...
5879                  */
5880                 stopsHere = (recordXid == recoveryTargetXid);
5881         }
5882
5883         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5884                 getRecordTimestamp(record, &recordXtime))
5885         {
5886                 /*
5887                  * There can be many transactions that share the same commit time, so
5888                  * we stop after the last one, if we are inclusive, or stop at the
5889                  * first one if we are exclusive
5890                  */
5891                 if (recoveryTargetInclusive)
5892                         stopsHere = (recordXtime > recoveryTargetTime);
5893                 else
5894                         stopsHere = (recordXtime >= recoveryTargetTime);
5895         }
5896
5897         if (stopsHere)
5898         {
5899                 recoveryStopAfter = false;
5900                 recoveryStopXid = recordXid;
5901                 recoveryStopTime = recordXtime;
5902                 recoveryStopLSN = InvalidXLogRecPtr;
5903                 recoveryStopName[0] = '\0';
5904
5905                 if (isCommit)
5906                 {
5907                         ereport(LOG,
5908                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5909                                                         recoveryStopXid,
5910                                                         timestamptz_to_str(recoveryStopTime))));
5911                 }
5912                 else
5913                 {
5914                         ereport(LOG,
5915                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5916                                                         recoveryStopXid,
5917                                                         timestamptz_to_str(recoveryStopTime))));
5918                 }
5919         }
5920
5921         return stopsHere;
5922 }
5923
5924 /*
5925  * Same as recoveryStopsBefore, but called after applying the record.
5926  *
5927  * We also track the timestamp of the latest applied COMMIT/ABORT
5928  * record in XLogCtl->recoveryLastXTime.
5929  */
5930 static bool
5931 recoveryStopsAfter(XLogReaderState *record)
5932 {
5933         uint8           info;
5934         uint8           xact_info;
5935         uint8           rmid;
5936         TimestampTz recordXtime;
5937
5938         info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5939         rmid = XLogRecGetRmid(record);
5940
5941         /*
5942          * There can be many restore points that share the same name; we stop at
5943          * the first one.
5944          */
5945         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5946                 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5947         {
5948                 xl_restore_point *recordRestorePointData;
5949
5950                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5951
5952                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5953                 {
5954                         recoveryStopAfter = true;
5955                         recoveryStopXid = InvalidTransactionId;
5956                         recoveryStopLSN = InvalidXLogRecPtr;
5957                         (void) getRecordTimestamp(record, &recoveryStopTime);
5958                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5959
5960                         ereport(LOG,
5961                                         (errmsg("recovery stopping at restore point \"%s\", time %s",
5962                                                         recoveryStopName,
5963                                                         timestamptz_to_str(recoveryStopTime))));
5964                         return true;
5965                 }
5966         }
5967
5968         /* Check if the target LSN has been reached */
5969         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5970                 recoveryTargetInclusive &&
5971                 record->ReadRecPtr >= recoveryTargetLSN)
5972         {
5973                 recoveryStopAfter = true;
5974                 recoveryStopXid = InvalidTransactionId;
5975                 recoveryStopLSN = record->ReadRecPtr;
5976                 recoveryStopTime = 0;
5977                 recoveryStopName[0] = '\0';
5978                 ereport(LOG,
5979                                 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
5980                                                 (uint32) (recoveryStopLSN >> 32),
5981                                                 (uint32) recoveryStopLSN)));
5982                 return true;
5983         }
5984
5985         if (rmid != RM_XACT_ID)
5986                 return false;
5987
5988         xact_info = info & XLOG_XACT_OPMASK;
5989
5990         if (xact_info == XLOG_XACT_COMMIT ||
5991                 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5992                 xact_info == XLOG_XACT_ABORT ||
5993                 xact_info == XLOG_XACT_ABORT_PREPARED)
5994         {
5995                 TransactionId recordXid;
5996
5997                 /* Update the last applied transaction timestamp */
5998                 if (getRecordTimestamp(record, &recordXtime))
5999                         SetLatestXTime(recordXtime);
6000
6001                 /* Extract the XID of the committed/aborted transaction */
6002                 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
6003                 {
6004                         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
6005                         xl_xact_parsed_commit parsed;
6006
6007                         ParseCommitRecord(XLogRecGetInfo(record),
6008                                                           xlrec,
6009                                                           &parsed);
6010                         recordXid = parsed.twophase_xid;
6011                 }
6012                 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
6013                 {
6014                         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
6015                         xl_xact_parsed_abort parsed;
6016
6017                         ParseAbortRecord(XLogRecGetInfo(record),
6018                                                          xlrec,
6019                                                          &parsed);
6020                         recordXid = parsed.twophase_xid;
6021                 }
6022                 else
6023                         recordXid = XLogRecGetXid(record);
6024
6025                 /*
6026                  * There can be only one transaction end record with this exact
6027                  * transactionid
6028                  *
6029                  * when testing for an xid, we MUST test for equality only, since
6030                  * transactions are numbered in the order they start, not the order
6031                  * they complete. A higher numbered xid will complete before you about
6032                  * 50% of the time...
6033                  */
6034                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
6035                         recordXid == recoveryTargetXid)
6036                 {
6037                         recoveryStopAfter = true;
6038                         recoveryStopXid = recordXid;
6039                         recoveryStopTime = recordXtime;
6040                         recoveryStopLSN = InvalidXLogRecPtr;
6041                         recoveryStopName[0] = '\0';
6042
6043                         if (xact_info == XLOG_XACT_COMMIT ||
6044                                 xact_info == XLOG_XACT_COMMIT_PREPARED)
6045                         {
6046                                 ereport(LOG,
6047                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
6048                                                                 recoveryStopXid,
6049                                                                 timestamptz_to_str(recoveryStopTime))));
6050                         }
6051                         else if (xact_info == XLOG_XACT_ABORT ||
6052                                          xact_info == XLOG_XACT_ABORT_PREPARED)
6053                         {
6054                                 ereport(LOG,
6055                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
6056                                                                 recoveryStopXid,
6057                                                                 timestamptz_to_str(recoveryStopTime))));
6058                         }
6059                         return true;
6060                 }
6061         }
6062
6063         /* Check if we should stop as soon as reaching consistency */
6064         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
6065         {
6066                 ereport(LOG,
6067                                 (errmsg("recovery stopping after reaching consistency")));
6068
6069                 recoveryStopAfter = true;
6070                 recoveryStopXid = InvalidTransactionId;
6071                 recoveryStopTime = 0;
6072                 recoveryStopLSN = InvalidXLogRecPtr;
6073                 recoveryStopName[0] = '\0';
6074                 return true;
6075         }
6076
6077         return false;
6078 }
6079
6080 /*
6081  * Wait until shared recoveryPause flag is cleared.
6082  *
6083  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
6084  * Probably not worth the trouble though.  This state shouldn't be one that
6085  * anyone cares about server power consumption in.
6086  */
6087 static void
6088 recoveryPausesHere(void)
6089 {
6090         /* Don't pause unless users can connect! */
6091         if (!LocalHotStandbyActive)
6092                 return;
6093
6094         ereport(LOG,
6095                         (errmsg("recovery has paused"),
6096                          errhint("Execute pg_wal_replay_resume() to continue.")));
6097
6098         while (RecoveryIsPaused())
6099         {
6100                 pg_usleep(1000000L);    /* 1000 ms */
6101                 HandleStartupProcInterrupts();
6102         }
6103 }
6104
6105 bool
6106 RecoveryIsPaused(void)
6107 {
6108         bool            recoveryPause;
6109
6110         SpinLockAcquire(&XLogCtl->info_lck);
6111         recoveryPause = XLogCtl->recoveryPause;
6112         SpinLockRelease(&XLogCtl->info_lck);
6113
6114         return recoveryPause;
6115 }
6116
6117 void
6118 SetRecoveryPause(bool recoveryPause)
6119 {
6120         SpinLockAcquire(&XLogCtl->info_lck);
6121         XLogCtl->recoveryPause = recoveryPause;
6122         SpinLockRelease(&XLogCtl->info_lck);
6123 }
6124
6125 /*
6126  * When recovery_min_apply_delay is set, we wait long enough to make sure
6127  * certain record types are applied at least that interval behind the master.
6128  *
6129  * Returns true if we waited.
6130  *
6131  * Note that the delay is calculated between the WAL record log time and
6132  * the current time on standby. We would prefer to keep track of when this
6133  * standby received each WAL record, which would allow a more consistent
6134  * approach and one not affected by time synchronisation issues, but that
6135  * is significantly more effort and complexity for little actual gain in
6136  * usability.
6137  */
6138 static bool
6139 recoveryApplyDelay(XLogReaderState *record)
6140 {
6141         uint8           xact_info;
6142         TimestampTz xtime;
6143         long            secs;
6144         int                     microsecs;
6145
6146         /* nothing to do if no delay configured */
6147         if (recovery_min_apply_delay <= 0)
6148                 return false;
6149
6150         /* no delay is applied on a database not yet consistent */
6151         if (!reachedConsistency)
6152                 return false;
6153
6154         /*
6155          * Is it a COMMIT record?
6156          *
6157          * We deliberately choose not to delay aborts since they have no effect on
6158          * MVCC. We already allow replay of records that don't have a timestamp,
6159          * so there is already opportunity for issues caused by early conflicts on
6160          * standbys.
6161          */
6162         if (XLogRecGetRmid(record) != RM_XACT_ID)
6163                 return false;
6164
6165         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
6166
6167         if (xact_info != XLOG_XACT_COMMIT &&
6168                 xact_info != XLOG_XACT_COMMIT_PREPARED)
6169                 return false;
6170
6171         if (!getRecordTimestamp(record, &xtime))
6172                 return false;
6173
6174         recoveryDelayUntilTime =
6175                 TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
6176
6177         /*
6178          * Exit without arming the latch if it's already past time to apply this
6179          * record
6180          */
6181         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6182                                                 &secs, &microsecs);
6183         if (secs <= 0 && microsecs <= 0)
6184                 return false;
6185
6186         while (true)
6187         {
6188                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6189
6190                 /* might change the trigger file's location */
6191                 HandleStartupProcInterrupts();
6192
6193                 if (CheckForStandbyTrigger())
6194                         break;
6195
6196                 /*
6197                  * Wait for difference between GetCurrentTimestamp() and
6198                  * recoveryDelayUntilTime
6199                  */
6200                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6201                                                         &secs, &microsecs);
6202
6203                 /* NB: We're ignoring waits below min_apply_delay's resolution. */
6204                 if (secs <= 0 && microsecs / 1000 <= 0)
6205                         break;
6206
6207                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6208                          secs, microsecs / 1000);
6209
6210                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6211                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6212                                   secs * 1000L + microsecs / 1000,
6213                                   WAIT_EVENT_RECOVERY_APPLY_DELAY);
6214         }
6215         return true;
6216 }
6217
6218 /*
6219  * Save timestamp of latest processed commit/abort record.
6220  *
6221  * We keep this in XLogCtl, not a simple static variable, so that it can be
6222  * seen by processes other than the startup process.  Note in particular
6223  * that CreateRestartPoint is executed in the checkpointer.
6224  */
6225 static void
6226 SetLatestXTime(TimestampTz xtime)
6227 {
6228         SpinLockAcquire(&XLogCtl->info_lck);
6229         XLogCtl->recoveryLastXTime = xtime;
6230         SpinLockRelease(&XLogCtl->info_lck);
6231 }
6232
6233 /*
6234  * Fetch timestamp of latest processed commit/abort record.
6235  */
6236 TimestampTz
6237 GetLatestXTime(void)
6238 {
6239         TimestampTz xtime;
6240
6241         SpinLockAcquire(&XLogCtl->info_lck);
6242         xtime = XLogCtl->recoveryLastXTime;
6243         SpinLockRelease(&XLogCtl->info_lck);
6244
6245         return xtime;
6246 }
6247
6248 /*
6249  * Save timestamp of the next chunk of WAL records to apply.
6250  *
6251  * We keep this in XLogCtl, not a simple static variable, so that it can be
6252  * seen by all backends.
6253  */
6254 static void
6255 SetCurrentChunkStartTime(TimestampTz xtime)
6256 {
6257         SpinLockAcquire(&XLogCtl->info_lck);
6258         XLogCtl->currentChunkStartTime = xtime;
6259         SpinLockRelease(&XLogCtl->info_lck);
6260 }
6261
6262 /*
6263  * Fetch timestamp of latest processed commit/abort record.
6264  * Startup process maintains an accurate local copy in XLogReceiptTime
6265  */
6266 TimestampTz
6267 GetCurrentChunkReplayStartTime(void)
6268 {
6269         TimestampTz xtime;
6270
6271         SpinLockAcquire(&XLogCtl->info_lck);
6272         xtime = XLogCtl->currentChunkStartTime;
6273         SpinLockRelease(&XLogCtl->info_lck);
6274
6275         return xtime;
6276 }
6277
6278 /*
6279  * Returns time of receipt of current chunk of XLOG data, as well as
6280  * whether it was received from streaming replication or from archives.
6281  */
6282 void
6283 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6284 {
6285         /*
6286          * This must be executed in the startup process, since we don't export the
6287          * relevant state to shared memory.
6288          */
6289         Assert(InRecovery);
6290
6291         *rtime = XLogReceiptTime;
6292         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6293 }
6294
/*
 * Raise an ERROR when an integer parameter (currValue) is lower than the
 * required minimum (minValue).
 *
 * The text supplied as param_name is a parameter name and so does not
 * require translation.  Note that currValue and minValue are evaluated a
 * second time when the check fails, to build the error message.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((minValue) > (currValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because %s = %d is a lower setting than on the master server (its value was %d)", \
						param_name, \
						currValue, \
						minValue))); \
} while (0)
6311
6312 /*
6313  * Check to see if required parameters are set high enough on this server
6314  * for various aspects of recovery operation.
6315  *
6316  * Note that all the parameters which this function tests need to be
6317  * listed in Administrator's Overview section in high-availability.sgml.
6318  * If you change them, don't forget to update the list.
6319  */
6320 static void
6321 CheckRequiredParameterValues(void)
6322 {
6323         /*
6324          * For archive recovery, the WAL must be generated with at least 'replica'
6325          * wal_level.
6326          */
6327         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6328         {
6329                 ereport(WARNING,
6330                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6331                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6332         }
6333
6334         /*
6335          * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6336          * must have at least as many backend slots as the primary.
6337          */
6338         if (ArchiveRecoveryRequested && EnableHotStandby)
6339         {
6340                 if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6341                         ereport(ERROR,
6342                                         (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6343                                          errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6344
6345                 /* We ignore autovacuum_max_workers when we make this test. */
6346                 RecoveryRequiresIntParameter("max_connections",
6347                                                                          MaxConnections,
6348                                                                          ControlFile->MaxConnections);
6349                 RecoveryRequiresIntParameter("max_worker_processes",
6350                                                                          max_worker_processes,
6351                                                                          ControlFile->max_worker_processes);
6352                 RecoveryRequiresIntParameter("max_prepared_transactions",
6353                                                                          max_prepared_xacts,
6354                                                                          ControlFile->max_prepared_xacts);
6355                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6356                                                                          max_locks_per_xact,
6357                                                                          ControlFile->max_locks_per_xact);
6358         }
6359 }
6360
6361 /*
6362  * This must be called ONCE during postmaster or standalone-backend startup
6363  */
6364 void
6365 StartupXLOG(void)
6366 {
6367         XLogCtlInsert *Insert;
6368         CheckPoint      checkPoint;
6369         bool            wasShutdown;
6370         bool            reachedStopPoint = false;
6371         bool            haveBackupLabel = false;
6372         bool            haveTblspcMap = false;
6373         XLogRecPtr      RecPtr,
6374                                 checkPointLoc,
6375                                 EndOfLog;
6376         TimeLineID      EndOfLogTLI;
6377         TimeLineID      PrevTimeLineID;
6378         XLogRecord *record;
6379         TransactionId oldestActiveXID;
6380         bool            backupEndRequired = false;
6381         bool            backupFromStandby = false;
6382         DBState         dbstate_at_startup;
6383         XLogReaderState *xlogreader;
6384         XLogPageReadPrivate private;
6385         bool            fast_promoted = false;
6386         struct stat st;
6387
6388         /*
6389          * We should have an aux process resource owner to use, and we should not
6390          * be in a transaction that's installed some other resowner.
6391          */
6392         Assert(AuxProcessResourceOwner != NULL);
6393         Assert(CurrentResourceOwner == NULL ||
6394                    CurrentResourceOwner == AuxProcessResourceOwner);
6395         CurrentResourceOwner = AuxProcessResourceOwner;
6396
6397         /*
6398          * Verify XLOG status looks valid.
6399          */
6400         if (ControlFile->state < DB_SHUTDOWNED ||
6401                 ControlFile->state > DB_IN_PRODUCTION ||
6402                 !XRecOffIsValid(ControlFile->checkPoint))
6403                 ereport(FATAL,
6404                                 (errmsg("control file contains invalid data")));
6405
6406         if (ControlFile->state == DB_SHUTDOWNED)
6407         {
6408                 /* This is the expected case, so don't be chatty in standalone mode */
6409                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6410                                 (errmsg("database system was shut down at %s",
6411                                                 str_time(ControlFile->time))));
6412         }
6413         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6414                 ereport(LOG,
6415                                 (errmsg("database system was shut down in recovery at %s",
6416                                                 str_time(ControlFile->time))));
6417         else if (ControlFile->state == DB_SHUTDOWNING)
6418                 ereport(LOG,
6419                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6420                                                 str_time(ControlFile->time))));
6421         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6422                 ereport(LOG,
6423                                 (errmsg("database system was interrupted while in recovery at %s",
6424                                                 str_time(ControlFile->time)),
6425                                  errhint("This probably means that some data is corrupted and"
6426                                                  " you will have to use the last backup for recovery.")));
6427         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6428                 ereport(LOG,
6429                                 (errmsg("database system was interrupted while in recovery at log time %s",
6430                                                 str_time(ControlFile->checkPointCopy.time)),
6431                                  errhint("If this has occurred more than once some data might be corrupted"
6432                                                  " and you might need to choose an earlier recovery target.")));
6433         else if (ControlFile->state == DB_IN_PRODUCTION)
6434                 ereport(LOG,
6435                                 (errmsg("database system was interrupted; last known up at %s",
6436                                                 str_time(ControlFile->time))));
6437
6438         /* This is just to allow attaching to startup process with a debugger */
6439 #ifdef XLOG_REPLAY_DELAY
6440         if (ControlFile->state != DB_SHUTDOWNED)
6441                 pg_usleep(60000000L);
6442 #endif
6443
6444         /*
6445          * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6446          * someone has performed a copy for PITR, these directories may have been
6447          * excluded and need to be re-created.
6448          */
6449         ValidateXLOGDirectoryStructure();
6450
6451         /*----------
6452          * If we previously crashed, perform a couple of actions:
6453          *      - The pg_wal directory may still include some temporary WAL segments
6454          * used when creating a new segment, so perform some clean up to not
6455          * bloat this path.  This is done first as there is no point to sync this
6456          * temporary data.
6457          *      - There might be data which we had written, intending to fsync it,
6458          * but which we had not actually fsync'd yet. Therefore, a power failure
6459          * in the near future might cause earlier unflushed writes to be lost,
6460          * even though more recent data written to disk from here on would be
6461          * persisted.  To avoid that, fsync the entire data directory.
6462          *---------
6463          */
6464         if (ControlFile->state != DB_SHUTDOWNED &&
6465                 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6466         {
6467                 RemoveTempXlogFiles();
6468                 SyncDataDirectory();
6469         }
6470
6471         /*
6472          * Initialize on the assumption we want to recover to the latest timeline
6473          * that's active according to pg_control.
6474          */
6475         if (ControlFile->minRecoveryPointTLI >
6476                 ControlFile->checkPointCopy.ThisTimeLineID)
6477                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6478         else
6479                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6480
6481         /*
6482          * Check for recovery control file, and if so set up state for offline
6483          * recovery
6484          */
6485         readRecoveryCommandFile();
6486
6487         /*
6488          * Save archive_cleanup_command in shared memory so that other processes
6489          * can see it.
6490          */
6491         strlcpy(XLogCtl->archiveCleanupCommand,
6492                         archiveCleanupCommand ? archiveCleanupCommand : "",
6493                         sizeof(XLogCtl->archiveCleanupCommand));
6494
6495         if (ArchiveRecoveryRequested)
6496         {
6497                 if (StandbyModeRequested)
6498                         ereport(LOG,
6499                                         (errmsg("entering standby mode")));
6500                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6501                         ereport(LOG,
6502                                         (errmsg("starting point-in-time recovery to XID %u",
6503                                                         recoveryTargetXid)));
6504                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6505                         ereport(LOG,
6506                                         (errmsg("starting point-in-time recovery to %s",
6507                                                         timestamptz_to_str(recoveryTargetTime))));
6508                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6509                         ereport(LOG,
6510                                         (errmsg("starting point-in-time recovery to \"%s\"",
6511                                                         recoveryTargetName)));
6512                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6513                         ereport(LOG,
6514                                         (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6515                                                         (uint32) (recoveryTargetLSN >> 32),
6516                                                         (uint32) recoveryTargetLSN)));
6517                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6518                         ereport(LOG,
6519                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6520                 else
6521                         ereport(LOG,
6522                                         (errmsg("starting archive recovery")));
6523         }
6524
6525         /*
6526          * Take ownership of the wakeup latch if we're going to sleep during
6527          * recovery.
6528          */
6529         if (StandbyModeRequested)
6530                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6531
6532         /* Set up XLOG reader facility */
6533         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6534         xlogreader = XLogReaderAllocate(wal_segment_size, &XLogPageRead, &private);
6535         if (!xlogreader)
6536                 ereport(ERROR,
6537                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6538                                  errmsg("out of memory"),
6539                                  errdetail("Failed while allocating a WAL reading processor.")));
6540         xlogreader->system_identifier = ControlFile->system_identifier;
6541
6542         /*
6543          * Allocate pages dedicated to WAL consistency checks, those had better be
6544          * aligned.
6545          */
6546         replay_image_masked = (char *) palloc(BLCKSZ);
6547         master_image_masked = (char *) palloc(BLCKSZ);
6548
6549         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6550                                                   &backupFromStandby))
6551         {
6552                 List       *tablespaces = NIL;
6553
6554                 /*
6555                  * Archive recovery was requested, and thanks to the backup label
6556                  * file, we know how far we need to replay to reach consistency. Enter
6557                  * archive recovery directly.
6558                  */
6559                 InArchiveRecovery = true;
6560                 if (StandbyModeRequested)
6561                         StandbyMode = true;
6562
6563                 /*
6564                  * When a backup_label file is present, we want to roll forward from
6565                  * the checkpoint it identifies, rather than using pg_control.
6566                  */
6567                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6568                 if (record != NULL)
6569                 {
6570                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6571                         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6572                         ereport(DEBUG1,
6573                                         (errmsg("checkpoint record is at %X/%X",
6574                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6575                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6576
6577                         /*
6578                          * Make sure that REDO location exists. This may not be the case
6579                          * if there was a crash during an online backup, which left a
6580                          * backup_label around that references a WAL segment that's
6581                          * already been archived.
6582                          */
6583                         if (checkPoint.redo < checkPointLoc)
6584                         {
6585                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6586                                         ereport(FATAL,
6587                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6588                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6589                         }
6590                 }
6591                 else
6592                 {
6593                         ereport(FATAL,
6594                                         (errmsg("could not locate required checkpoint record"),
6595                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6596                         wasShutdown = false;    /* keep compiler quiet */
6597                 }
6598
6599                 /* read the tablespace_map file if present and create symlinks. */
6600                 if (read_tablespace_map(&tablespaces))
6601                 {
6602                         ListCell   *lc;
6603
6604                         foreach(lc, tablespaces)
6605                         {
6606                                 tablespaceinfo *ti = lfirst(lc);
6607                                 char       *linkloc;
6608
6609                                 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6610
6611                                 /*
6612                                  * Remove the existing symlink if any and Create the symlink
6613                                  * under PGDATA.
6614                                  */
6615                                 remove_tablespace_symlink(linkloc);
6616
6617                                 if (symlink(ti->path, linkloc) < 0)
6618                                         ereport(ERROR,
6619                                                         (errcode_for_file_access(),
6620                                                          errmsg("could not create symbolic link \"%s\": %m",
6621                                                                         linkloc)));
6622
6623                                 pfree(ti->oid);
6624                                 pfree(ti->path);
6625                                 pfree(ti);
6626                         }
6627
6628                         /* set flag to delete it later */
6629                         haveTblspcMap = true;
6630                 }
6631
6632                 /* set flag to delete it later */
6633                 haveBackupLabel = true;
6634         }
6635         else
6636         {
6637                 /*
6638                  * If tablespace_map file is present without backup_label file, there
6639                  * is no use of such file.  There is no harm in retaining it, but it
6640                  * is better to get rid of the map file so that we don't have any
6641                  * redundant file in data directory and it will avoid any sort of
6642                  * confusion.  It seems prudent though to just rename the file out of
6643                  * the way rather than delete it completely, also we ignore any error
6644                  * that occurs in rename operation as even if map file is present
6645                  * without backup_label file, it is harmless.
6646                  */
6647                 if (stat(TABLESPACE_MAP, &st) == 0)
6648                 {
6649                         unlink(TABLESPACE_MAP_OLD);
6650                         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6651                                 ereport(LOG,
6652                                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6653                                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6654                                                  errdetail("File \"%s\" was renamed to \"%s\".",
6655                                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6656                         else
6657                                 ereport(LOG,
6658                                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6659                                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6660                                                  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6661                                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6662                 }
6663
6664                 /*
6665                  * It's possible that archive recovery was requested, but we don't
6666                  * know how far we need to replay the WAL before we reach consistency.
6667                  * This can happen for example if a base backup is taken from a
6668                  * running server using an atomic filesystem snapshot, without calling
6669                  * pg_start/stop_backup. Or if you just kill a running master server
6670                  * and put it into archive recovery by creating a recovery.conf file.
6671                  *
6672                  * Our strategy in that case is to perform crash recovery first,
6673                  * replaying all the WAL present in pg_wal, and only enter archive
6674                  * recovery after that.
6675                  *
6676                  * But usually we already know how far we need to replay the WAL (up
6677                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6678                  * end-of-backup record), and we can enter archive recovery directly.
6679                  */
6680                 if (ArchiveRecoveryRequested &&
6681                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6682                          ControlFile->backupEndRequired ||
6683                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6684                          ControlFile->state == DB_SHUTDOWNED))
6685                 {
6686                         InArchiveRecovery = true;
6687                         if (StandbyModeRequested)
6688                                 StandbyMode = true;
6689                 }
6690
6691                 /* Get the last valid checkpoint record. */
6692                 checkPointLoc = ControlFile->checkPoint;
6693                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6694                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6695                 if (record != NULL)
6696                 {
6697                         ereport(DEBUG1,
6698                                         (errmsg("checkpoint record is at %X/%X",
6699                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6700                 }
6701                 else
6702                 {
6703                         /*
6704                          * We used to attempt to go back to a secondary checkpoint record
6705                          * here, but only when not in standby_mode. We now just fail if we
6706                          * can't read the last checkpoint because this allows us to
6707                          * simplify processing around checkpoints.
6708                          */
6709                         ereport(PANIC,
6710                                         (errmsg("could not locate a valid checkpoint record")));
6711                 }
6712                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6713                 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6714         }
6715
6716         /*
6717          * Clear out any old relcache cache files.  This is *necessary* if we do
6718          * any WAL replay, since that would probably result in the cache files
6719          * being out of sync with database reality.  In theory we could leave them
6720          * in place if the database had been cleanly shut down, but it seems
6721          * safest to just remove them always and let them be rebuilt during the
6722          * first backend startup.  These files needs to be removed from all
6723          * directories including pg_tblspc, however the symlinks are created only
6724          * after reading tablespace_map file in case of archive recovery from
6725          * backup, so needs to clear old relcache files here after creating
6726          * symlinks.
6727          */
6728         RelationCacheInitFileRemove();
6729
6730         /*
6731          * If the location of the checkpoint record is not on the expected
6732          * timeline in the history of the requested timeline, we cannot proceed:
6733          * the backup is not part of the history of the requested timeline.
6734          */
6735         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6736                                                                  * record */
6737         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6738                 checkPoint.ThisTimeLineID)
6739         {
6740                 XLogRecPtr      switchpoint;
6741
6742                 /*
6743                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6744                  * not in expectedTLEs at all.
6745                  */
6746                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6747                 ereport(FATAL,
6748                                 (errmsg("requested timeline %u is not a child of this server's history",
6749                                                 recoveryTargetTLI),
6750                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6751                                                    (uint32) (ControlFile->checkPoint >> 32),
6752                                                    (uint32) ControlFile->checkPoint,
6753                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6754                                                    (uint32) (switchpoint >> 32),
6755                                                    (uint32) switchpoint)));
6756         }
6757
6758         /*
6759          * The min recovery point should be part of the requested timeline's
6760          * history, too.
6761          */
6762         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6763                 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6764                 ControlFile->minRecoveryPointTLI)
6765                 ereport(FATAL,
6766                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6767                                                 recoveryTargetTLI,
6768                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6769                                                 (uint32) ControlFile->minRecoveryPoint,
6770                                                 ControlFile->minRecoveryPointTLI)));
6771
6772         LastRec = RecPtr = checkPointLoc;
6773
6774         ereport(DEBUG1,
6775                         (errmsg_internal("redo record is at %X/%X; shutdown %s",
6776                                                          (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6777                                                          wasShutdown ? "true" : "false")));
6778         ereport(DEBUG1,
6779                         (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6780                                                          checkPoint.nextXidEpoch, checkPoint.nextXid,
6781                                                          checkPoint.nextOid)));
6782         ereport(DEBUG1,
6783                         (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6784                                                          checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6785         ereport(DEBUG1,
6786                         (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6787                                                          checkPoint.oldestXid, checkPoint.oldestXidDB)));
6788         ereport(DEBUG1,
6789                         (errmsg_internal("oldest MultiXactId: %u, in database %u",
6790                                                          checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6791         ereport(DEBUG1,
6792                         (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6793                                                          checkPoint.oldestCommitTsXid,
6794                                                          checkPoint.newestCommitTsXid)));
6795         if (!TransactionIdIsNormal(checkPoint.nextXid))
6796                 ereport(PANIC,
6797                                 (errmsg("invalid next transaction ID")));
6798
6799         /* initialize shared memory variables from the checkpoint record */
6800         ShmemVariableCache->nextXid = checkPoint.nextXid;
6801         ShmemVariableCache->nextOid = checkPoint.nextOid;
6802         ShmemVariableCache->oidCount = 0;
6803         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6804         AdvanceOldestClogXid(checkPoint.oldestXid);
6805         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6806         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6807         SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6808                                          checkPoint.newestCommitTsXid);
6809         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6810         XLogCtl->ckptXid = checkPoint.nextXid;
6811
6812         /*
6813          * Initialize replication slots, before there's a chance to remove
6814          * required resources.
6815          */
6816         StartupReplicationSlots();
6817
6818         /*
6819          * Start up logical state; it needs to be set up now so we have proper
6820          * data during crash recovery.
6821          */
6822         StartupReorderBuffer();
6823
6824         /*
6825          * Startup MultiXact. We need to do this early to be able to replay
6826          * truncations.
6827          */
6828         StartupMultiXact();
6829
6830         /*
6831          * Ditto commit timestamps.  In a standby, we do it if setting is enabled
6832          * in ControlFile; in a master we base the decision on the GUC itself.
6833          */
6834         if (ArchiveRecoveryRequested ?
6835                 ControlFile->track_commit_timestamp : track_commit_timestamp)
6836                 StartupCommitTs();
6837
6838         /*
6839          * Recover knowledge about replay progress of known replication partners.
6840          */
6841         StartupReplicationOrigin();
6842
6843         /*
6844          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6845          * control file. On recovery, all unlogged relations are blown away, so
6846          * the unlogged LSN counter can be reset too.
6847          */
6848         if (ControlFile->state == DB_SHUTDOWNED)
6849                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6850         else
6851                 XLogCtl->unloggedLSN = 1;
6852
6853         /*
6854          * We must replay WAL entries using the same TimeLineID they were created
6855          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6856          * also xlog_redo()).
6857          */
6858         ThisTimeLineID = checkPoint.ThisTimeLineID;
6859
6860         /*
6861          * Copy any missing timeline history files between 'now' and the recovery
6862          * target timeline from archive to pg_wal. While we don't need those files
6863          * ourselves - the history file of the recovery target timeline covers all
6864          * the previous timelines in the history too - a cascading standby server
6865          * might be interested in them. Or, if you archive the WAL from this
6866          * server to a different archive than the master, it'd be good for all the
6867          * history files to get archived there after failover, so that you can use
6868          * one of the old timelines as a PITR target. Timeline history files are
6869          * small, so it's better to copy them unnecessarily than not copy them and
6870          * regret later.
6871          */
6872         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6873
6874         /*
6875          * Before running in recovery, scan pg_twophase and fill in its status to
6876          * be able to work on entries generated by redo.  Doing a scan before
6877          * taking any recovery action has the merit of discarding any 2PC files
6878          * that are newer than the first record to replay, avoiding any conflicts
6879          * at replay.  This also avoids any subsequent scans when doing recovery
6880          * of the on-disk two-phase data.
6881          */
6882         restoreTwoPhaseData();
6883
6884         lastFullPageWrites = checkPoint.fullPageWrites;
6885
6886         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6887         doPageWrites = lastFullPageWrites;
6888
6889         if (RecPtr < checkPoint.redo)
6890                 ereport(PANIC,
6891                                 (errmsg("invalid redo in checkpoint record")));
6892
6893         /*
6894          * Check whether we need to force recovery from WAL.  If it appears to
6895          * have been a clean shutdown and we did not have a recovery.conf file,
6896          * then assume no recovery needed.
6897          */
6898         if (checkPoint.redo < RecPtr)
6899         {
6900                 if (wasShutdown)
6901                         ereport(PANIC,
6902                                         (errmsg("invalid redo record in shutdown checkpoint")));
6903                 InRecovery = true;
6904         }
6905         else if (ControlFile->state != DB_SHUTDOWNED)
6906                 InRecovery = true;
6907         else if (ArchiveRecoveryRequested)
6908         {
6909                 /* force recovery due to presence of recovery.conf */
6910                 InRecovery = true;
6911         }
6912
6913         /* REDO */
6914         if (InRecovery)
6915         {
6916                 int                     rmid;
6917
6918                 /*
6919                  * Update pg_control to show that we are recovering and to show the
6920                  * selected checkpoint as the place we are starting from. We also mark
6921                  * pg_control with any minimum recovery stop point obtained from a
6922                  * backup history file.
6923                  */
6924                 dbstate_at_startup = ControlFile->state;
6925                 if (InArchiveRecovery)
6926                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6927                 else
6928                 {
6929                         ereport(LOG,
6930                                         (errmsg("database system was not properly shut down; "
6931                                                         "automatic recovery in progress")));
6932                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6933                                 ereport(LOG,
6934                                                 (errmsg("crash recovery starts in timeline %u "
6935                                                                 "and has target timeline %u",
6936                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6937                                                                 recoveryTargetTLI)));
6938                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6939                 }
6940                 ControlFile->checkPoint = checkPointLoc;
6941                 ControlFile->checkPointCopy = checkPoint;
6942                 if (InArchiveRecovery)
6943                 {
6944                         /* initialize minRecoveryPoint if not set yet */
6945                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6946                         {
6947                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6948                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6949                         }
6950                 }
6951
6952                 /*
6953                  * Set backupStartPoint if we're starting recovery from a base backup.
6954                  *
6955                  * Also set backupEndPoint and use minRecoveryPoint as the backup end
6956                  * location if we're starting recovery from a base backup which was
6957                  * taken from a standby. In this case, the database system status in
6958                  * pg_control must indicate that the database was already in recovery.
6959                  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6960                  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6961                  * before reaching this point; e.g. because restore_command or
6962                  * primary_conninfo were faulty.
6963                  *
6964                  * Any other state indicates that the backup somehow became corrupted
6965                  * and we can't sensibly continue with recovery.
6966                  */
6967                 if (haveBackupLabel)
6968                 {
6969                         ControlFile->backupStartPoint = checkPoint.redo;
6970                         ControlFile->backupEndRequired = backupEndRequired;
6971
6972                         if (backupFromStandby)
6973                         {
6974                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6975                                         dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6976                                         ereport(FATAL,
6977                                                         (errmsg("backup_label contains data inconsistent with control file"),
6978                                                          errhint("This means that the backup is corrupted and you will "
6979                                                                          "have to use another backup for recovery.")));
6980                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6981                         }
6982                 }
6983                 ControlFile->time = (pg_time_t) time(NULL);
6984                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6985                 UpdateControlFile();
6986
6987                 /*
6988                  * Initialize our local copy of minRecoveryPoint.  When doing crash
6989                  * recovery we want to replay up to the end of WAL.  In particular,
6990                  * in the case of a promoted standby, the minRecoveryPoint value in
6991                  * the control file is only updated after the first checkpoint.  If
6992                  * the instance crashes before that first post-recovery checkpoint
6993                  * is completed, recovery would otherwise use a stale location,
6994                  * causing the startup process to think that there are still invalid
6995                  * page references when checking for data consistency.
6996                  */
6997                 if (InArchiveRecovery)
6998                 {
6999                         minRecoveryPoint = ControlFile->minRecoveryPoint;
7000                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7001                 }
7002                 else
7003                 {
7004                         minRecoveryPoint = InvalidXLogRecPtr;
7005                         minRecoveryPointTLI = 0;
7006                 }
7007
7008                 /*
7009                  * Reset pgstat data, because it may be invalid after recovery.
7010                  */
7011                 pgstat_reset_all();
7012
7013                 /*
7014                  * If there was a backup label file, it's done its job and the info
7015                  * has now been propagated into pg_control.  We must get rid of the
7016                  * label file so that if we crash during recovery, we'll pick up at
7017                  * the latest recovery restartpoint instead of going all the way back
7018                  * to the backup start point.  It seems prudent though to just rename
7019                  * the file out of the way rather than delete it completely.
7020                  */
7021                 if (haveBackupLabel)
7022                 {
7023                         unlink(BACKUP_LABEL_OLD);
7024                         durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7025                 }
7026
7027                 /*
7028                  * If there was a tablespace_map file, it's done its job and the
7029                  * symlinks have been created.  We must get rid of the map file so
7030                  * that if we crash during recovery, we don't create symlinks again.
7031                  * It seems prudent though to just rename the file out of the way
7032                  * rather than delete it completely.
7033                  */
7034                 if (haveTblspcMap)
7035                 {
7036                         unlink(TABLESPACE_MAP_OLD);
7037                         durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7038                 }
7039
7040                 /* Check that the GUCs used to generate the WAL allow recovery */
7041                 CheckRequiredParameterValues();
7042
7043                 /*
7044                  * We're in recovery, so unlogged relations may be trashed and must be
7045                  * reset.  This should be done BEFORE allowing Hot Standby
7046                  * connections, so that read-only backends don't try to read whatever
7047                  * garbage is left over from before.
7048                  */
7049                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7050
7051                 /*
7052                  * Likewise, delete any saved transaction snapshot files that got left
7053                  * behind by crashed backends.
7054                  */
7055                 DeleteAllExportedSnapshotFiles();
7056
7057                 /*
7058                  * Initialize for Hot Standby, if enabled. We won't let backends in
7059                  * yet, not until we've reached the min recovery point specified in
7060                  * control file and we've established a recovery snapshot from a
7061                  * running-xacts WAL record.
7062                  */
7063                 if (ArchiveRecoveryRequested && EnableHotStandby)
7064                 {
7065                         TransactionId *xids;
7066                         int                     nxids;
7067
7068                         ereport(DEBUG1,
7069                                         (errmsg("initializing for hot standby")));
7070
7071                         InitRecoveryTransactionEnvironment();
7072
7073                         if (wasShutdown)
7074                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7075                         else
7076                                 oldestActiveXID = checkPoint.oldestActiveXid;
7077                         Assert(TransactionIdIsValid(oldestActiveXID));
7078
7079                         /* Tell procarray about the range of xids it has to deal with */
7080                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
7081
7082                         /*
7083                          * Startup commit log and subtrans only.  MultiXact and commit
7084                          * timestamp have already been started up and other SLRUs are not
7085                          * maintained during recovery and need not be started yet.
7086                          */
7087                         StartupCLOG();
7088                         StartupSUBTRANS(oldestActiveXID);
7089
7090                         /*
7091                          * If we're beginning at a shutdown checkpoint, we know that
7092                          * nothing was running on the master at this point. So fake-up an
7093                          * empty running-xacts record and use that here and now. Recover
7094                          * additional standby state for prepared transactions.
7095                          */
7096                         if (wasShutdown)
7097                         {
7098                                 RunningTransactionsData running;
7099                                 TransactionId latestCompletedXid;
7100
7101                                 /*
7102                                  * Construct a RunningTransactions snapshot representing a
7103                                  * shut down server, with only prepared transactions still
7104                                  * alive. We're never overflowed at this point because all
7105                                  * subxids are listed with their parent prepared transactions.
7106                                  */
7107                                 running.xcnt = nxids;
7108                                 running.subxcnt = 0;
7109                                 running.subxid_overflow = false;
7110                                 running.nextXid = checkPoint.nextXid;
7111                                 running.oldestRunningXid = oldestActiveXID;
7112                                 latestCompletedXid = checkPoint.nextXid;
7113                                 TransactionIdRetreat(latestCompletedXid);
7114                                 Assert(TransactionIdIsNormal(latestCompletedXid));
7115                                 running.latestCompletedXid = latestCompletedXid;
7116                                 running.xids = xids;
7117
7118                                 ProcArrayApplyRecoveryInfo(&running);
7119
7120                                 StandbyRecoverPreparedTransactions();
7121                         }
7122                 }
7123
7124                 /* Initialize resource managers */
7125                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7126                 {
7127                         if (RmgrTable[rmid].rm_startup != NULL)
7128                                 RmgrTable[rmid].rm_startup();
7129                 }
7130
7131                 /*
7132                  * Initialize shared variables for tracking progress of WAL replay, as
7133                  * if we had just replayed the record before the REDO location (or the
7134                  * checkpoint record itself, if it's a shutdown checkpoint).
7135                  */
7136                 SpinLockAcquire(&XLogCtl->info_lck);
7137                 if (checkPoint.redo < RecPtr)
7138                         XLogCtl->replayEndRecPtr = checkPoint.redo;
7139                 else
7140                         XLogCtl->replayEndRecPtr = EndRecPtr;
7141                 XLogCtl->replayEndTLI = ThisTimeLineID;
7142                 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7143                 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7144                 XLogCtl->recoveryLastXTime = 0;
7145                 XLogCtl->currentChunkStartTime = 0;
7146                 XLogCtl->recoveryPause = false;
7147                 SpinLockRelease(&XLogCtl->info_lck);
7148
7149                 /* Also ensure XLogReceiptTime has a sane value */
7150                 XLogReceiptTime = GetCurrentTimestamp();
7151
7152                 /*
7153                  * Let postmaster know we've started redo now, so that it can launch
7154                  * checkpointer to perform restartpoints.  We don't bother during
7155                  * crash recovery as restartpoints can only be performed during
7156                  * archive recovery.  And we'd like to keep crash recovery simple, to
7157                  * avoid introducing bugs that could affect you when recovering after
7158                  * crash.
7159                  *
7160                  * After this point, we can no longer assume that we're the only
7161                  * process in addition to postmaster!  Also, fsync requests are
7162                  * subsequently to be handled by the checkpointer, not locally.
7163                  */
7164                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
7165                 {
7166                         PublishStartupProcessInformation();
7167                         SetForwardFsyncRequests();
7168                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7169                         bgwriterLaunched = true;
7170                 }
7171
7172                 /*
7173                  * Allow read-only connections immediately if we're consistent
7174                  * already.
7175                  */
7176                 CheckRecoveryConsistency();
7177
7178                 /*
7179                  * Find the first record that logically follows the checkpoint --- it
7180                  * might physically precede it, though.
7181                  */
7182                 if (checkPoint.redo < RecPtr)
7183                 {
7184                         /* back up to find the record */
7185                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
7186                 }
7187                 else
7188                 {
7189                         /* just have to read next record after CheckPoint */
7190                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7191                 }
7192
7193                 if (record != NULL)
7194                 {
7195                         ErrorContextCallback errcallback;
7196                         TimestampTz xtime;
7197
7198                         InRedo = true;
7199
7200                         ereport(LOG,
7201                                         (errmsg("redo starts at %X/%X",
7202                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7203
7204                         /*
7205                          * main redo apply loop
7206                          */
7207                         do
7208                         {
7209                                 bool            switchedTLI = false;
7210
7211 #ifdef WAL_DEBUG
7212                                 if (XLOG_DEBUG ||
7213                                         (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7214                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7215                                 {
7216                                         StringInfoData buf;
7217
7218                                         initStringInfo(&buf);
7219                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7220                                                                          (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7221                                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7222                                         xlog_outrec(&buf, xlogreader);
7223                                         appendStringInfoString(&buf, " - ");
7224                                         xlog_outdesc(&buf, xlogreader);
7225                                         elog(LOG, "%s", buf.data);
7226                                         pfree(buf.data);
7227                                 }
7228 #endif
7229
7230                                 /* Handle interrupt signals of startup process */
7231                                 HandleStartupProcInterrupts();
7232
7233                                 /*
7234                                  * Pause WAL replay, if requested by a hot-standby session via
7235                                  * SetRecoveryPause().
7236                                  *
7237                                  * Note that we intentionally don't take the info_lck spinlock
7238                                  * here.  We might therefore read a slightly stale value of
7239                                  * the recoveryPause flag, but it can't be very stale (no
7240                                  * worse than the last spinlock we did acquire).  Since a
7241                                  * pause request is a pretty asynchronous thing anyway,
7242                                  * possibly responding to it one WAL record later than we
7243                                  * otherwise would is a minor issue, so it doesn't seem worth
7244                                  * adding another spinlock cycle to prevent that.
7245                                  */
7246                                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7247                                         recoveryPausesHere();
7248
7249                                 /*
7250                                  * Have we reached our recovery target?
7251                                  */
7252                                 if (recoveryStopsBefore(xlogreader))
7253                                 {
7254                                         reachedStopPoint = true;        /* see below */
7255                                         break;
7256                                 }
7257
7258                                 /*
7259                                  * If we've been asked to lag the master, wait on latch until
7260                                  * enough time has passed.
7261                                  */
7262                                 if (recoveryApplyDelay(xlogreader))
7263                                 {
7264                                         /*
7265                                          * We test for paused recovery again here. If user sets
7266                                          * delayed apply, it may be because they expect to pause
7267                                          * recovery in case of problems, so we must test again
7268                                          * here otherwise pausing during the delay-wait wouldn't
7269                                          * work.
7270                                          */
7271                                         if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7272                                                 recoveryPausesHere();
7273                                 }
7274
7275                                 /* Setup error traceback support for ereport() */
7276                                 errcallback.callback = rm_redo_error_callback;
7277                                 errcallback.arg = (void *) xlogreader;
7278                                 errcallback.previous = error_context_stack;
7279                                 error_context_stack = &errcallback;
7280
7281                                 /*
7282                                  * ShmemVariableCache->nextXid must be beyond record's xid.
7283                                  *
7284                                  * We don't expect anyone else to modify nextXid, hence we
7285                                  * don't need to hold a lock while examining it.  We still
7286                                  * acquire the lock to modify it, though.
7287                                  */
7288                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
7289                                                                                                  ShmemVariableCache->nextXid))
7290                                 {
7291                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7292                                         ShmemVariableCache->nextXid = record->xl_xid;
7293                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
7294                                         LWLockRelease(XidGenLock);
7295                                 }
7296
7297                                 /*
7298                                  * Before replaying this record, check if this record causes
7299                                  * the current timeline to change. The record is already
7300                                  * considered to be part of the new timeline, so we update
7301                                  * ThisTimeLineID before replaying it. That's important so
7302                                  * that replayEndTLI, which is recorded as the minimum
7303                                  * recovery point's TLI if recovery stops after this record,
7304                                  * is set correctly.
7305                                  */
7306                                 if (record->xl_rmid == RM_XLOG_ID)
7307                                 {
7308                                         TimeLineID      newTLI = ThisTimeLineID;
7309                                         TimeLineID      prevTLI = ThisTimeLineID;
7310                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7311
7312                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
7313                                         {
7314                                                 CheckPoint      checkPoint;
7315
7316                                                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7317                                                 newTLI = checkPoint.ThisTimeLineID;
7318                                                 prevTLI = checkPoint.PrevTimeLineID;
7319                                         }
7320                                         else if (info == XLOG_END_OF_RECOVERY)
7321                                         {
7322                                                 xl_end_of_recovery xlrec;
7323
7324                                                 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7325                                                 newTLI = xlrec.ThisTimeLineID;
7326                                                 prevTLI = xlrec.PrevTimeLineID;
7327                                         }
7328
7329                                         if (newTLI != ThisTimeLineID)
7330                                         {
7331                                                 /* Check that it's OK to switch to this TLI */
7332                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7333
7334                                                 /* Following WAL records should be run with new TLI */
7335                                                 ThisTimeLineID = newTLI;
7336                                                 switchedTLI = true;
7337                                         }
7338                                 }
7339
7340                                 /*
7341                                  * Update shared replayEndRecPtr before replaying this record,
7342                                  * so that XLogFlush will update minRecoveryPoint correctly.
7343                                  */
7344                                 SpinLockAcquire(&XLogCtl->info_lck);
7345                                 XLogCtl->replayEndRecPtr = EndRecPtr;
7346                                 XLogCtl->replayEndTLI = ThisTimeLineID;
7347                                 SpinLockRelease(&XLogCtl->info_lck);
7348
7349                                 /*
7350                                  * If we are attempting to enter Hot Standby mode, process
7351                                  * XIDs we see
7352                                  */
7353                                 if (standbyState >= STANDBY_INITIALIZED &&
7354                                         TransactionIdIsValid(record->xl_xid))
7355                                         RecordKnownAssignedTransactionIds(record->xl_xid);
7356
7357                                 /* Now apply the WAL record itself */
7358                                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7359
7360                                 /*
7361                                  * After redo, check whether the backup pages associated with
7362                                  * the WAL record are consistent with the existing pages. This
7363                                  * check is done only if consistency check is enabled for this
7364                                  * record.
7365                                  */
7366                                 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7367                                         checkXLogConsistency(xlogreader);
7368
7369                                 /* Pop the error context stack */
7370                                 error_context_stack = errcallback.previous;
7371
7372                                 /*
7373                                  * Update lastReplayedEndRecPtr after this record has been
7374                                  * successfully replayed.
7375                                  */
7376                                 SpinLockAcquire(&XLogCtl->info_lck);
7377                                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7378                                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7379                                 SpinLockRelease(&XLogCtl->info_lck);
7380
7381                                 /*
7382                                  * If rm_redo called XLogRequestWalReceiverReply, then we wake
7383                                  * up the receiver so that it notices the updated
7384                                  * lastReplayedEndRecPtr and sends a reply to the master.
7385                                  */
7386                                 if (doRequestWalReceiverReply)
7387                                 {
7388                                         doRequestWalReceiverReply = false;
7389                                         WalRcvForceReply();
7390                                 }
7391
7392                                 /* Remember this record as the last-applied one */
7393                                 LastRec = ReadRecPtr;
7394
7395                                 /* Allow read-only connections if we're consistent now */
7396                                 CheckRecoveryConsistency();
7397
7398                                 /* Is this a timeline switch? */
7399                                 if (switchedTLI)
7400                                 {
7401                                         /*
7402                                          * Before we continue on the new timeline, clean up any
7403                                          * (possibly bogus) future WAL segments on the old
7404                                          * timeline.
7405                                          */
7406                                         RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7407
7408                                         /*
7409                                          * Wake up any walsenders to notice that we are on a new
7410                                          * timeline.
7411                                          */
7412                                         if (switchedTLI && AllowCascadeReplication())
7413                                                 WalSndWakeup();
7414                                 }
7415
7416                                 /* Exit loop if we reached inclusive recovery target */
7417                                 if (recoveryStopsAfter(xlogreader))
7418                                 {
7419                                         reachedStopPoint = true;
7420                                         break;
7421                                 }
7422
7423                                 /* Else, try to fetch the next WAL record */
7424                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7425                         } while (record != NULL);
7426
7427                         /*
7428                          * end of main redo apply loop
7429                          */
7430
7431                         if (reachedStopPoint)
7432                         {
7433                                 if (!reachedConsistency)
7434                                         ereport(FATAL,
7435                                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7436
7437                                 /*
7438                                  * This is the last point where we can restart recovery with a
7439                                  * new recovery target, if we shut down and begin again. After
7440                                  * this, Resource Managers may choose to do permanent
7441                                  * corrective actions at end of recovery.
7442                                  */
7443                                 switch (recoveryTargetAction)
7444                                 {
7445                                         case RECOVERY_TARGET_ACTION_SHUTDOWN:
7446
7447                                                 /*
7448                                                  * exit with special return code to request shutdown
7449                                                  * of postmaster.  Log messages issued from
7450                                                  * postmaster.
7451                                                  */
7452                                                 proc_exit(3);
7453
7454                                         case RECOVERY_TARGET_ACTION_PAUSE:
7455                                                 SetRecoveryPause(true);
7456                                                 recoveryPausesHere();
7457
7458                                                 /* drop into promote */
7459
7460                                         case RECOVERY_TARGET_ACTION_PROMOTE:
7461                                                 break;
7462                                 }
7463                         }
7464
7465                         /* Allow resource managers to do any required cleanup. */
7466                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7467                         {
7468                                 if (RmgrTable[rmid].rm_cleanup != NULL)
7469                                         RmgrTable[rmid].rm_cleanup();
7470                         }
7471
7472                         ereport(LOG,
7473                                         (errmsg("redo done at %X/%X",
7474                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7475                         xtime = GetLatestXTime();
7476                         if (xtime)
7477                                 ereport(LOG,
7478                                                 (errmsg("last completed transaction was at log time %s",
7479                                                                 timestamptz_to_str(xtime))));
7480
7481                         InRedo = false;
7482                 }
7483                 else
7484                 {
7485                         /* there are no WAL records following the checkpoint */
7486                         ereport(LOG,
7487                                         (errmsg("redo is not required")));
7488                 }
7489         }
7490
7491         /*
7492          * Kill WAL receiver, if it's still running, before we continue to write
7493          * the startup checkpoint record. It will trump over the checkpoint and
7494          * subsequent records if it's still alive when we start writing WAL.
7495          */
7496         ShutdownWalRcv();
7497
7498         /*
7499          * Reset unlogged relations to the contents of their INIT fork. This is
7500          * done AFTER recovery is complete so as to include any unlogged relations
7501          * created during recovery, but BEFORE recovery is marked as having
7502          * completed successfully. Otherwise we'd not retry if any of the post
7503          * end-of-recovery steps fail.
7504          */
7505         if (InRecovery)
7506                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7507
7508         /*
7509          * We don't need the latch anymore. It's not strictly necessary to disown
7510          * it, but let's do it for the sake of tidiness.
7511          */
7512         if (StandbyModeRequested)
7513                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7514
7515         /*
7516          * We are now done reading the xlog from stream. Turn off streaming
7517          * recovery to force fetching the files (which would be required at end of
7518          * recovery, e.g., timeline history file) from archive or pg_wal.
7519          */
7520         StandbyMode = false;
7521
7522         /*
7523          * Re-fetch the last valid or last applied record, so we can identify the
7524          * exact endpoint of what we consider the valid portion of WAL.
7525          */
7526         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7527         EndOfLog = EndRecPtr;
7528
7529         /*
7530          * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7531          * the end-of-log. It could be different from the timeline that EndOfLog
7532          * nominally belongs to, if there was a timeline switch in that segment,
7533          * and we were reading the old WAL from a segment belonging to a higher
7534          * timeline.
7535          */
7536         EndOfLogTLI = xlogreader->readPageTLI;
7537
7538         /*
7539          * Complain if we did not roll forward far enough to render the backup
7540          * dump consistent.  Note: it is indeed okay to look at the local variable
7541          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7542          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7543          * advanced beyond the WAL we processed.
7544          */
7545         if (InRecovery &&
7546                 (EndOfLog < minRecoveryPoint ||
7547                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7548         {
7549                 /*
7550                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7551                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7552                  * tried to recover from an online backup but never called
7553                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7554                  * point. However, this also happens in crash recovery, if the system
7555                  * crashes while an online backup is in progress. We must not treat
7556                  * that as an error, or the database will refuse to start up.
7557                  */
7558                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7559                 {
7560                         if (ControlFile->backupEndRequired)
7561                                 ereport(FATAL,
7562                                                 (errmsg("WAL ends before end of online backup"),
7563                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7564                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7565                                 ereport(FATAL,
7566                                                 (errmsg("WAL ends before end of online backup"),
7567                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7568                         else
7569                                 ereport(FATAL,
7570                                                 (errmsg("WAL ends before consistent recovery point")));
7571                 }
7572         }
7573
7574         /*
7575          * Pre-scan prepared transactions to find out the range of XIDs present.
7576          * This information is not quite needed yet, but it is positioned here so
7577          * that potential problems are detected before any on-disk change is done.
7578          */
7579         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7580
7581         /*
7582          * Consider whether we need to assign a new timeline ID.
7583          *
7584          * If we are doing an archive recovery, we always assign a new ID.  This
7585          * handles a couple of issues.  If we stopped short of the end of WAL
7586          * during recovery, then we are clearly generating a new timeline and must
7587          * assign it a unique new ID.  Even if we ran to the end, modifying the
7588          * current last segment is problematic because it may result in trying to
7589          * overwrite an already-archived copy of that segment, and we encourage
7590          * DBAs to make their archive_commands reject that.  We can dodge the
7591          * problem by making the new active segment have a new timeline ID.
7592          *
7593          * In a normal crash recovery, we can just extend the timeline we were in.
7594          */
7595         PrevTimeLineID = ThisTimeLineID;
7596         if (ArchiveRecoveryRequested)
7597         {
7598                 char            reason[200];
7599
7600                 Assert(InArchiveRecovery);
7601
7602                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7603                 ereport(LOG,
7604                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7605
7606                 /*
7607                  * Create a comment for the history file to explain why and where
7608                  * timeline changed.
7609                  */
7610                 if (recoveryTarget == RECOVERY_TARGET_XID)
7611                         snprintf(reason, sizeof(reason),
7612                                          "%s transaction %u",
7613                                          recoveryStopAfter ? "after" : "before",
7614                                          recoveryStopXid);
7615                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7616                         snprintf(reason, sizeof(reason),
7617                                          "%s %s\n",
7618                                          recoveryStopAfter ? "after" : "before",
7619                                          timestamptz_to_str(recoveryStopTime));
7620                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7621                         snprintf(reason, sizeof(reason),
7622                                          "%s LSN %X/%X\n",
7623                                          recoveryStopAfter ? "after" : "before",
7624                                          (uint32) (recoveryStopLSN >> 32),
7625                                          (uint32) recoveryStopLSN);
7626                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7627                         snprintf(reason, sizeof(reason),
7628                                          "at restore point \"%s\"",
7629                                          recoveryStopName);
7630                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7631                         snprintf(reason, sizeof(reason), "reached consistency");
7632                 else
7633                         snprintf(reason, sizeof(reason), "no recovery target specified");
7634
7635                 /*
7636                  * We are now done reading the old WAL.  Turn off archive fetching if
7637                  * it was active, and make a writable copy of the last WAL segment.
7638                  * (Note that we also have a copy of the last block of the old WAL in
7639                  * readBuf; we will use that below.)
7640                  */
7641                 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7642
7643                 /*
7644                  * Write the timeline history file, and have it archived. After this
7645                  * point (or rather, as soon as the file is archived), the timeline
7646                  * will appear as "taken" in the WAL archive and to any standby
7647                  * servers.  If we crash before actually switching to the new
7648                  * timeline, standby servers will nevertheless think that we switched
7649                  * to the new timeline, and will try to connect to the new timeline.
7650                  * To minimize the window for that, try to do as little as possible
7651                  * between here and writing the end-of-recovery record.
7652                  */
7653                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7654                                                          EndRecPtr, reason);
7655         }
7656
7657         /* Save the selected TimeLineID in shared memory, too */
7658         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7659         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7660
7661         /*
7662          * Prepare to write WAL starting at EndOfLog location, and init xlog
7663          * buffer cache using the block containing the last record from the
7664          * previous incarnation.
7665          */
7666         Insert = &XLogCtl->Insert;
7667         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7668         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7669
7670         /*
7671          * Tricky point here: readBuf contains the *last* block that the LastRec
7672          * record spans, not the one it starts in.  The last block is indeed the
7673          * one we want to use.
7674          */
7675         if (EndOfLog % XLOG_BLCKSZ != 0)
7676         {
7677                 char       *page;
7678                 int                     len;
7679                 int                     firstIdx;
7680                 XLogRecPtr      pageBeginPtr;
7681
7682                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7683                 Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7684
7685                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7686
7687                 /* Copy the valid part of the last block, and zero the rest */
7688                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7689                 len = EndOfLog % XLOG_BLCKSZ;
7690                 memcpy(page, xlogreader->readBuf, len);
7691                 memset(page + len, 0, XLOG_BLCKSZ - len);
7692
7693                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7694                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7695         }
7696         else
7697         {
7698                 /*
7699                  * There is no partial block to copy. Just set InitializedUpTo, and
7700                  * let the first attempt to insert a log record to initialize the next
7701                  * buffer.
7702                  */
7703                 XLogCtl->InitializedUpTo = EndOfLog;
7704         }
7705
7706         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7707
7708         XLogCtl->LogwrtResult = LogwrtResult;
7709
7710         XLogCtl->LogwrtRqst.Write = EndOfLog;
7711         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7712
7713         /*
7714          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7715          * record before resource manager writes cleanup WAL records or checkpoint
7716          * record is written.
7717          */
7718         Insert->fullPageWrites = lastFullPageWrites;
7719         LocalSetXLogInsertAllowed();
7720         UpdateFullPageWrites();
7721         LocalXLogInsertAllowed = -1;
7722
7723         if (InRecovery)
7724         {
7725                 /*
7726                  * Perform a checkpoint to update all our recovery activity to disk.
7727                  *
7728                  * Note that we write a shutdown checkpoint rather than an on-line
7729                  * one. This is not particularly critical, but since we may be
7730                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7731                  * the rule that TLI only changes in shutdown checkpoints, which
7732                  * allows some extra error checking in xlog_redo.
7733                  *
7734                  * In fast promotion, only create a lightweight end-of-recovery record
7735                  * instead of a full checkpoint. A checkpoint is requested later,
7736                  * after we're fully out of recovery mode and already accepting
7737                  * queries.
7738                  */
7739                 if (bgwriterLaunched)
7740                 {
7741                         if (fast_promote)
7742                         {
7743                                 checkPointLoc = ControlFile->checkPoint;
7744
7745                                 /*
7746                                  * Confirm the last checkpoint is available for us to recover
7747                                  * from if we fail.
7748                                  */
7749                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7750                                 if (record != NULL)
7751                                 {
7752                                         fast_promoted = true;
7753
7754                                         /*
7755                                          * Insert a special WAL record to mark the end of
7756                                          * recovery, since we aren't doing a checkpoint. That
7757                                          * means that the checkpointer process may likely be in
7758                                          * the middle of a time-smoothed restartpoint and could
7759                                          * continue to be for minutes after this. That sounds
7760                                          * strange, but the effect is roughly the same and it
7761                                          * would be stranger to try to come out of the
7762                                          * restartpoint and then checkpoint. We request a
7763                                          * checkpoint later anyway, just for safety.
7764                                          */
7765                                         CreateEndOfRecoveryRecord();
7766                                 }
7767                         }
7768
7769                         if (!fast_promoted)
7770                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7771                                                                   CHECKPOINT_IMMEDIATE |
7772                                                                   CHECKPOINT_WAIT);
7773                 }
7774                 else
7775                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7776
7777                 /*
7778                  * And finally, execute the recovery_end_command, if any.
7779                  */
7780                 if (recoveryEndCommand)
7781                         ExecuteRecoveryCommand(recoveryEndCommand,
7782                                                                    "recovery_end_command",
7783                                                                    true);
7784         }
7785
7786         if (ArchiveRecoveryRequested)
7787         {
7788                 /*
7789                  * We switched to a new timeline. Clean up segments on the old
7790                  * timeline.
7791                  *
7792                  * If there are any higher-numbered segments on the old timeline,
7793                  * remove them. They might contain valid WAL, but they might also be
7794                  * pre-allocated files containing garbage. In any case, they are not
7795                  * part of the new timeline's history so we don't need them.
7796                  */
7797                 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7798
7799                 /*
7800                  * If the switch happened in the middle of a segment, what to do with
7801                  * the last, partial segment on the old timeline? If we don't archive
7802                  * it, and the server that created the WAL never archives it either
7803                  * (e.g. because it was hit by a meteor), it will never make it to the
7804                  * archive. That's OK from our point of view, because the new segment
7805                  * that we created with the new TLI contains all the WAL from the old
7806                  * timeline up to the switch point. But if you later try to do PITR to
7807                  * the "missing" WAL on the old timeline, recovery won't find it in
7808                  * the archive. It's physically present in the new file with new TLI,
7809                  * but recovery won't look there when it's recovering to the older
7810                  * timeline. On the other hand, if we archive the partial segment, and
7811                  * the original server on that timeline is still running and archives
7812                  * the completed version of the same segment later, it will fail. (We
7813                  * used to do that in 9.4 and below, and it caused such problems).
7814                  *
7815                  * As a compromise, we rename the last segment with the .partial
7816                  * suffix, and archive it. Archive recovery will never try to read
7817                  * .partial segments, so they will normally go unused. But in the odd
7818                  * PITR case, the administrator can copy them manually to the pg_wal
7819                  * directory (removing the suffix). They can be useful in debugging,
7820                  * too.
7821                  *
7822                  * If a .done or .ready file already exists for the old timeline,
7823                  * however, we had already determined that the segment is complete, so
7824                  * we can let it be archived normally. (In particular, if it was
7825                  * restored from the archive to begin with, it's expected to have a
7826                  * .done file).
7827                  */
7828                 if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
7829                         XLogArchivingActive())
7830                 {
7831                         char            origfname[MAXFNAMELEN];
7832                         XLogSegNo       endLogSegNo;
7833
7834                         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7835                         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7836
7837                         if (!XLogArchiveIsReadyOrDone(origfname))
7838                         {
7839                                 char            origpath[MAXPGPATH];
7840                                 char            partialfname[MAXFNAMELEN];
7841                                 char            partialpath[MAXPGPATH];
7842
7843                                 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7844                                 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7845                                 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7846
7847                                 /*
7848                                  * Make sure there's no .done or .ready file for the .partial
7849                                  * file.
7850                                  */
7851                                 XLogArchiveCleanup(partialfname);
7852
7853                                 durable_rename(origpath, partialpath, ERROR);
7854                                 XLogArchiveNotify(partialfname);
7855                         }
7856                 }
7857         }
7858
7859         /*
7860          * Preallocate additional log files, if wanted.
7861          */
7862         PreallocXlogFiles(EndOfLog);
7863
7864         /*
7865          * Okay, we're officially UP.
7866          */
7867         InRecovery = false;
7868
7869         /* start the archive_timeout timer and LSN running */
7870         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7871         XLogCtl->lastSegSwitchLSN = EndOfLog;
7872
7873         /* also initialize latestCompletedXid, to nextXid - 1 */
7874         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7875         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7876         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7877         LWLockRelease(ProcArrayLock);
7878
7879         /*
7880          * Start up the commit log and subtrans, if not already done for hot
7881          * standby.  (commit timestamps are started below, if necessary.)
7882          */
7883         if (standbyState == STANDBY_DISABLED)
7884         {
7885                 StartupCLOG();
7886                 StartupSUBTRANS(oldestActiveXID);
7887         }
7888
7889         /*
7890          * Perform end of recovery actions for any SLRUs that need it.
7891          */
7892         TrimCLOG();
7893         TrimMultiXact();
7894
7895         /* Reload shared-memory state for prepared transactions */
7896         RecoverPreparedTransactions();
7897
7898         /*
7899          * Shutdown the recovery environment. This must occur after
7900          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7901          */
7902         if (standbyState != STANDBY_DISABLED)
7903                 ShutdownRecoveryTransactionEnvironment();
7904
7905         /* Shut down xlogreader */
7906         if (readFile >= 0)
7907         {
7908                 close(readFile);
7909                 readFile = -1;
7910         }
7911         XLogReaderFree(xlogreader);
7912
7913         /*
7914          * If any of the critical GUCs have changed, log them before we allow
7915          * backends to write WAL.
7916          */
7917         LocalSetXLogInsertAllowed();
7918         XLogReportParameters();
7919
7920         /*
7921          * Local WAL inserts enabled, so it's time to finish initialization of
7922          * commit timestamp.
7923          */
7924         CompleteCommitTsInitialization();
7925
7926         /*
7927          * All done with end-of-recovery actions.
7928          *
7929          * Now allow backends to write WAL and update the control file status in
7930          * consequence.  The boolean flag allowing backends to write WAL is
7931          * updated while holding ControlFileLock to prevent other backends to look
7932          * at an inconsistent state of the control file in shared memory.  There
7933          * is still a small window during which backends can write WAL and the
7934          * control file is still referring to a system not in DB_IN_PRODUCTION
7935          * state while looking at the on-disk control file.
7936          *
7937          * Also, although the boolean flag to allow WAL is probably atomic in
7938          * itself, we use the info_lck here to ensure that there are no race
7939          * conditions concerning visibility of other recent updates to shared
7940          * memory.
7941          */
7942         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7943         ControlFile->state = DB_IN_PRODUCTION;
7944         ControlFile->time = (pg_time_t) time(NULL);
7945
7946         SpinLockAcquire(&XLogCtl->info_lck);
7947         XLogCtl->SharedRecoveryInProgress = false;
7948         SpinLockRelease(&XLogCtl->info_lck);
7949
7950         UpdateControlFile();
7951         LWLockRelease(ControlFileLock);
7952
7953         /*
7954          * If there were cascading standby servers connected to us, nudge any wal
7955          * sender processes to notice that we've been promoted.
7956          */
7957         WalSndWakeup();
7958
7959         /*
7960          * If this was a fast promotion, request an (online) checkpoint now. This
7961          * isn't required for consistency, but the last restartpoint might be far
7962          * back, and in case of a crash, recovering from it might take a longer
7963          * than is appropriate now that we're not in standby mode anymore.
7964          */
7965         if (fast_promoted)
7966                 RequestCheckpoint(CHECKPOINT_FORCE);
7967 }
7968
7969 /*
7970  * Checks if recovery has reached a consistent state. When consistency is
7971  * reached and we have a valid starting standby snapshot, tell postmaster
7972  * that it can start accepting read-only connections.
7973  */
7974 static void
7975 CheckRecoveryConsistency(void)
7976 {
7977         XLogRecPtr      lastReplayedEndRecPtr;
7978
7979         /*
7980          * During crash recovery, we don't reach a consistent state until we've
7981          * replayed all the WAL.
7982          */
7983         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7984                 return;
7985
7986         Assert(InArchiveRecovery);
7987
7988         /*
7989          * assume that we are called in the startup process, and hence don't need
7990          * a lock to read lastReplayedEndRecPtr
7991          */
7992         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7993
7994         /*
7995          * Have we reached the point where our base backup was completed?
7996          */
7997         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7998                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7999         {
8000                 /*
8001                  * We have reached the end of base backup, as indicated by pg_control.
8002                  * The data on disk is now consistent. Reset backupStartPoint and
8003                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
8004                  * allow starting up at an earlier point even if recovery is stopped
8005                  * and restarted soon after this.
8006                  */
8007                 elog(DEBUG1, "end of backup reached");
8008
8009                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8010
8011                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
8012                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
8013
8014                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
8015                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
8016                 ControlFile->backupEndRequired = false;
8017                 UpdateControlFile();
8018
8019                 LWLockRelease(ControlFileLock);
8020         }
8021
8022         /*
8023          * Have we passed our safe starting point? Note that minRecoveryPoint is
8024          * known to be incorrectly set if ControlFile->backupEndRequired, until
8025          * the XLOG_BACKUP_END arrives to advise us of the correct
8026          * minRecoveryPoint. All we know prior to that is that we're not
8027          * consistent yet.
8028          */
8029         if (!reachedConsistency && !ControlFile->backupEndRequired &&
8030                 minRecoveryPoint <= lastReplayedEndRecPtr &&
8031                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
8032         {
8033                 /*
8034                  * Check to see if the XLOG sequence contained any unresolved
8035                  * references to uninitialized pages.
8036                  */
8037                 XLogCheckInvalidPages();
8038
8039                 reachedConsistency = true;
8040                 ereport(LOG,
8041                                 (errmsg("consistent recovery state reached at %X/%X",
8042                                                 (uint32) (lastReplayedEndRecPtr >> 32),
8043                                                 (uint32) lastReplayedEndRecPtr)));
8044         }
8045
8046         /*
8047          * Have we got a valid starting snapshot that will allow queries to be
8048          * run? If so, we can tell postmaster that the database is consistent now,
8049          * enabling connections.
8050          */
8051         if (standbyState == STANDBY_SNAPSHOT_READY &&
8052                 !LocalHotStandbyActive &&
8053                 reachedConsistency &&
8054                 IsUnderPostmaster)
8055         {
8056                 SpinLockAcquire(&XLogCtl->info_lck);
8057                 XLogCtl->SharedHotStandbyActive = true;
8058                 SpinLockRelease(&XLogCtl->info_lck);
8059
8060                 LocalHotStandbyActive = true;
8061
8062                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
8063         }
8064 }
8065
8066 /*
8067  * Is the system still in recovery?
8068  *
8069  * Unlike testing InRecovery, this works in any process that's connected to
8070  * shared memory.
8071  *
8072  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
8073  * variables the first time we see that recovery is finished.
8074  */
8075 bool
8076 RecoveryInProgress(void)
8077 {
8078         /*
8079          * We check shared state each time only until we leave recovery mode. We
8080          * can't re-enter recovery, so there's no need to keep checking after the
8081          * shared variable has once been seen false.
8082          */
8083         if (!LocalRecoveryInProgress)
8084                 return false;
8085         else
8086         {
8087                 /*
8088                  * use volatile pointer to make sure we make a fresh read of the
8089                  * shared variable.
8090                  */
8091                 volatile XLogCtlData *xlogctl = XLogCtl;
8092
8093                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
8094
8095                 /*
8096                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
8097                  * is finished. InitPostgres() relies upon this behaviour to ensure
8098                  * that InitXLOGAccess() is called at backend startup.  (If you change
8099                  * this, see also LocalSetXLogInsertAllowed.)
8100                  */
8101                 if (!LocalRecoveryInProgress)
8102                 {
8103                         /*
8104                          * If we just exited recovery, make sure we read TimeLineID and
8105                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
8106                          * weak memory ordering).
8107                          */
8108                         pg_memory_barrier();
8109                         InitXLOGAccess();
8110                 }
8111
8112                 /*
8113                  * Note: We don't need a memory barrier when we're still in recovery.
8114                  * We might exit recovery immediately after return, so the caller
8115                  * can't rely on 'true' meaning that we're still in recovery anyway.
8116                  */
8117
8118                 return LocalRecoveryInProgress;
8119         }
8120 }
8121
8122 /*
8123  * Is HotStandby active yet? This is only important in special backends
8124  * since normal backends won't ever be able to connect until this returns
8125  * true. Postmaster knows this by way of signal, not via shared memory.
8126  *
8127  * Unlike testing standbyState, this works in any process that's connected to
8128  * shared memory.  (And note that standbyState alone doesn't tell the truth
8129  * anyway.)
8130  */
8131 bool
8132 HotStandbyActive(void)
8133 {
8134         /*
8135          * We check shared state each time only until Hot Standby is active. We
8136          * can't de-activate Hot Standby, so there's no need to keep checking
8137          * after the shared variable has once been seen true.
8138          */
8139         if (LocalHotStandbyActive)
8140                 return true;
8141         else
8142         {
8143                 /* spinlock is essential on machines with weak memory ordering! */
8144                 SpinLockAcquire(&XLogCtl->info_lck);
8145                 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8146                 SpinLockRelease(&XLogCtl->info_lck);
8147
8148                 return LocalHotStandbyActive;
8149         }
8150 }
8151
8152 /*
8153  * Like HotStandbyActive(), but to be used only in WAL replay code,
8154  * where we don't need to ask any other process what the state is.
8155  */
8156 bool
8157 HotStandbyActiveInReplay(void)
8158 {
8159         Assert(AmStartupProcess() || !IsPostmasterEnvironment);
8160         return LocalHotStandbyActive;
8161 }
8162
8163 /*
8164  * Is this process allowed to insert new WAL records?
8165  *
8166  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8167  * But we also have provisions for forcing the result "true" or "false"
8168  * within specific processes regardless of the global state.
8169  */
8170 bool
8171 XLogInsertAllowed(void)
8172 {
8173         /*
8174          * If value is "unconditionally true" or "unconditionally false", just
8175          * return it.  This provides the normal fast path once recovery is known
8176          * done.
8177          */
8178         if (LocalXLogInsertAllowed >= 0)
8179                 return (bool) LocalXLogInsertAllowed;
8180
8181         /*
8182          * Else, must check to see if we're still in recovery.
8183          */
8184         if (RecoveryInProgress())
8185                 return false;
8186
8187         /*
8188          * On exit from recovery, reset to "unconditionally true", since there is
8189          * no need to keep checking.
8190          */
8191         LocalXLogInsertAllowed = 1;
8192         return true;
8193 }
8194
8195 /*
8196  * Make XLogInsertAllowed() return true in the current process only.
8197  *
8198  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
8199  * and even call LocalSetXLogInsertAllowed() again after that.
8200  */
8201 static void
8202 LocalSetXLogInsertAllowed(void)
8203 {
8204         Assert(LocalXLogInsertAllowed == -1);
8205         LocalXLogInsertAllowed = 1;
8206
8207         /* Initialize as RecoveryInProgress() would do when switching state */
8208         InitXLOGAccess();
8209 }
8210
8211 /*
8212  * Subroutine to try to fetch and validate a prior checkpoint record.
8213  *
8214  * whichChkpt identifies the checkpoint (merely for reporting purposes).
8215  * 1 for "primary", 0 for "other" (backup_label)
8216  */
8217 static XLogRecord *
8218 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8219                                          int whichChkpt, bool report)
8220 {
8221         XLogRecord *record;
8222         uint8           info;
8223
8224         if (!XRecOffIsValid(RecPtr))
8225         {
8226                 if (!report)
8227                         return NULL;
8228
8229                 switch (whichChkpt)
8230                 {
8231                         case 1:
8232                                 ereport(LOG,
8233                                                 (errmsg("invalid primary checkpoint link in control file")));
8234                                 break;
8235                         default:
8236                                 ereport(LOG,
8237                                                 (errmsg("invalid checkpoint link in backup_label file")));
8238                                 break;
8239                 }
8240                 return NULL;
8241         }
8242
8243         record = ReadRecord(xlogreader, RecPtr, LOG, true);
8244
8245         if (record == NULL)
8246         {
8247                 if (!report)
8248                         return NULL;
8249
8250                 switch (whichChkpt)
8251                 {
8252                         case 1:
8253                                 ereport(LOG,
8254                                                 (errmsg("invalid primary checkpoint record")));
8255                                 break;
8256                         default:
8257                                 ereport(LOG,
8258                                                 (errmsg("invalid checkpoint record")));
8259                                 break;
8260                 }
8261                 return NULL;
8262         }
8263         if (record->xl_rmid != RM_XLOG_ID)
8264         {
8265                 switch (whichChkpt)
8266                 {
8267                         case 1:
8268                                 ereport(LOG,
8269                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
8270                                 break;
8271                         default:
8272                                 ereport(LOG,
8273                                                 (errmsg("invalid resource manager ID in checkpoint record")));
8274                                 break;
8275                 }
8276                 return NULL;
8277         }
8278         info = record->xl_info & ~XLR_INFO_MASK;
8279         if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8280                 info != XLOG_CHECKPOINT_ONLINE)
8281         {
8282                 switch (whichChkpt)
8283                 {
8284                         case 1:
8285                                 ereport(LOG,
8286                                                 (errmsg("invalid xl_info in primary checkpoint record")));
8287                                 break;
8288                         default:
8289                                 ereport(LOG,
8290                                                 (errmsg("invalid xl_info in checkpoint record")));
8291                                 break;
8292                 }
8293                 return NULL;
8294         }
8295         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8296         {
8297                 switch (whichChkpt)
8298                 {
8299                         case 1:
8300                                 ereport(LOG,
8301                                                 (errmsg("invalid length of primary checkpoint record")));
8302                                 break;
8303                         default:
8304                                 ereport(LOG,
8305                                                 (errmsg("invalid length of checkpoint record")));
8306                                 break;
8307                 }
8308                 return NULL;
8309         }
8310         return record;
8311 }
8312
8313 /*
8314  * This must be called in a backend process before creating WAL records
8315  * (except in a standalone backend, which does StartupXLOG instead).  We need
8316  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8317  *
8318  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8319  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
8320  * unnecessary however, since the postmaster itself never touches XLOG anyway.
8321  */
8322 void
8323 InitXLOGAccess(void)
8324 {
8325         XLogCtlInsert *Insert = &XLogCtl->Insert;
8326
8327         /* ThisTimeLineID doesn't change so we need no lock to copy it */
8328         ThisTimeLineID = XLogCtl->ThisTimeLineID;
8329         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8330
8331         /* set wal_segment_size */
8332         wal_segment_size = ControlFile->xlog_seg_size;
8333
8334         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8335         (void) GetRedoRecPtr();
8336         /* Also update our copy of doPageWrites. */
8337         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8338
8339         /* Also initialize the working areas for constructing WAL records */
8340         InitXLogInsert();
8341 }
8342
8343 /*
8344  * Return the current Redo pointer from shared memory.
8345  *
8346  * As a side-effect, the local RedoRecPtr copy is updated.
8347  */
8348 XLogRecPtr
8349 GetRedoRecPtr(void)
8350 {
8351         XLogRecPtr      ptr;
8352
8353         /*
8354          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8355          * grabbed a WAL insertion lock to read the master copy, someone might
8356          * update it just after we've released the lock.
8357          */
8358         SpinLockAcquire(&XLogCtl->info_lck);
8359         ptr = XLogCtl->RedoRecPtr;
8360         SpinLockRelease(&XLogCtl->info_lck);
8361
8362         if (RedoRecPtr < ptr)
8363                 RedoRecPtr = ptr;
8364
8365         return RedoRecPtr;
8366 }
8367
8368 /*
8369  * Return information needed to decide whether a modified block needs a
8370  * full-page image to be included in the WAL record.
8371  *
8372  * The returned values are cached copies from backend-private memory, and
8373  * possibly out-of-date.  XLogInsertRecord will re-check them against
8374  * up-to-date values, while holding the WAL insert lock.
8375  */
8376 void
8377 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8378 {
8379         *RedoRecPtr_p = RedoRecPtr;
8380         *doPageWrites_p = doPageWrites;
8381 }
8382
8383 /*
8384  * GetInsertRecPtr -- Returns the current insert position.
8385  *
8386  * NOTE: The value *actually* returned is the position of the last full
8387  * xlog page. It lags behind the real insert position by at most 1 page.
8388  * For that, we don't need to scan through WAL insertion locks, and an
8389  * approximation is enough for the current usage of this function.
8390  */
8391 XLogRecPtr
8392 GetInsertRecPtr(void)
8393 {
8394         XLogRecPtr      recptr;
8395
8396         SpinLockAcquire(&XLogCtl->info_lck);
8397         recptr = XLogCtl->LogwrtRqst.Write;
8398         SpinLockRelease(&XLogCtl->info_lck);
8399
8400         return recptr;
8401 }
8402
8403 /*
8404  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8405  * position known to be fsync'd to disk.
8406  */
8407 XLogRecPtr
8408 GetFlushRecPtr(void)
8409 {
8410         SpinLockAcquire(&XLogCtl->info_lck);
8411         LogwrtResult = XLogCtl->LogwrtResult;
8412         SpinLockRelease(&XLogCtl->info_lck);
8413
8414         return LogwrtResult.Flush;
8415 }
8416
8417 /*
8418  * GetLastImportantRecPtr -- Returns the LSN of the last important record
8419  * inserted. All records not explicitly marked as unimportant are considered
8420  * important.
8421  *
8422  * The LSN is determined by computing the maximum of
8423  * WALInsertLocks[i].lastImportantAt.
8424  */
8425 XLogRecPtr
8426 GetLastImportantRecPtr(void)
8427 {
8428         XLogRecPtr      res = InvalidXLogRecPtr;
8429         int                     i;
8430
8431         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8432         {
8433                 XLogRecPtr      last_important;
8434
8435                 /*
8436                  * Need to take a lock to prevent torn reads of the LSN, which are
8437                  * possible on some of the supported platforms. WAL insert locks only
8438                  * support exclusive mode, so we have to use that.
8439                  */
8440                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8441                 last_important = WALInsertLocks[i].l.lastImportantAt;
8442                 LWLockRelease(&WALInsertLocks[i].l.lock);
8443
8444                 if (res < last_important)
8445                         res = last_important;
8446         }
8447
8448         return res;
8449 }
8450
8451 /*
8452  * Get the time and LSN of the last xlog segment switch
8453  */
8454 pg_time_t
8455 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8456 {
8457         pg_time_t       result;
8458
8459         /* Need WALWriteLock, but shared lock is sufficient */
8460         LWLockAcquire(WALWriteLock, LW_SHARED);
8461         result = XLogCtl->lastSegSwitchTime;
8462         *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8463         LWLockRelease(WALWriteLock);
8464
8465         return result;
8466 }
8467
8468 /*
8469  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8470  *
8471  * This is exported for use by code that would like to have 64-bit XIDs.
8472  * We don't really support such things, but all XIDs within the system
8473  * can be presumed "close to" the result, and thus the epoch associated
8474  * with them can be determined.
8475  */
8476 void
8477 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8478 {
8479         uint32          ckptXidEpoch;
8480         TransactionId ckptXid;
8481         TransactionId nextXid;
8482
8483         /* Must read checkpoint info first, else have race condition */
8484         SpinLockAcquire(&XLogCtl->info_lck);
8485         ckptXidEpoch = XLogCtl->ckptXidEpoch;
8486         ckptXid = XLogCtl->ckptXid;
8487         SpinLockRelease(&XLogCtl->info_lck);
8488
8489         /* Now fetch current nextXid */
8490         nextXid = ReadNewTransactionId();
8491
8492         /*
8493          * nextXid is certainly logically later than ckptXid.  So if it's
8494          * numerically less, it must have wrapped into the next epoch.
8495          */
8496         if (nextXid < ckptXid)
8497                 ckptXidEpoch++;
8498
8499         *xid = nextXid;
8500         *epoch = ckptXidEpoch;
8501 }
8502
8503 /*
8504  * This must be called ONCE during postmaster or standalone-backend shutdown
8505  */
8506 void
8507 ShutdownXLOG(int code, Datum arg)
8508 {
8509         /*
8510          * We should have an aux process resource owner to use, and we should not
8511          * be in a transaction that's installed some other resowner.
8512          */
8513         Assert(AuxProcessResourceOwner != NULL);
8514         Assert(CurrentResourceOwner == NULL ||
8515                    CurrentResourceOwner == AuxProcessResourceOwner);
8516         CurrentResourceOwner = AuxProcessResourceOwner;
8517
8518         /* Don't be chatty in standalone mode */
8519         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8520                         (errmsg("shutting down")));
8521
8522         /*
8523          * Signal walsenders to move to stopping state.
8524          */
8525         WalSndInitStopping();
8526
8527         /*
8528          * Wait for WAL senders to be in stopping state.  This prevents commands
8529          * from writing new WAL.
8530          */
8531         WalSndWaitStopping();
8532
8533         if (RecoveryInProgress())
8534                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8535         else
8536         {
8537                 /*
8538                  * If archiving is enabled, rotate the last XLOG file so that all the
8539                  * remaining records are archived (postmaster wakes up the archiver
8540                  * process one more time at the end of shutdown). The checkpoint
8541                  * record will go to the next XLOG file and won't be archived (yet).
8542                  */
8543                 if (XLogArchivingActive() && XLogArchiveCommandSet())
8544                         RequestXLogSwitch(false);
8545
8546                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8547         }
8548         ShutdownCLOG();
8549         ShutdownCommitTs();
8550         ShutdownSUBTRANS();
8551         ShutdownMultiXact();
8552 }
8553
8554 /*
8555  * Log start of a checkpoint.
8556  */
8557 static void
8558 LogCheckpointStart(int flags, bool restartpoint)
8559 {
8560         elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8561                  restartpoint ? "restartpoint" : "checkpoint",
8562                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8563                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8564                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8565                  (flags & CHECKPOINT_FORCE) ? " force" : "",
8566                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
8567                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8568                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8569                  (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8570 }
8571
8572 /*
8573  * Log end of a checkpoint.
8574  */
8575 static void
8576 LogCheckpointEnd(bool restartpoint)
8577 {
8578         long            write_secs,
8579                                 sync_secs,
8580                                 total_secs,
8581                                 longest_secs,
8582                                 average_secs;
8583         int                     write_usecs,
8584                                 sync_usecs,
8585                                 total_usecs,
8586                                 longest_usecs,
8587                                 average_usecs;
8588         uint64          average_sync_time;
8589
8590         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8591
8592         TimestampDifference(CheckpointStats.ckpt_write_t,
8593                                                 CheckpointStats.ckpt_sync_t,
8594                                                 &write_secs, &write_usecs);
8595
8596         TimestampDifference(CheckpointStats.ckpt_sync_t,
8597                                                 CheckpointStats.ckpt_sync_end_t,
8598                                                 &sync_secs, &sync_usecs);
8599
8600         /* Accumulate checkpoint timing summary data, in milliseconds. */
8601         BgWriterStats.m_checkpoint_write_time +=
8602                 write_secs * 1000 + write_usecs / 1000;
8603         BgWriterStats.m_checkpoint_sync_time +=
8604                 sync_secs * 1000 + sync_usecs / 1000;
8605
8606         /*
8607          * All of the published timing statistics are accounted for.  Only
8608          * continue if a log message is to be written.
8609          */
8610         if (!log_checkpoints)
8611                 return;
8612
8613         TimestampDifference(CheckpointStats.ckpt_start_t,
8614                                                 CheckpointStats.ckpt_end_t,
8615                                                 &total_secs, &total_usecs);
8616
8617         /*
8618          * Timing values returned from CheckpointStats are in microseconds.
8619          * Convert to the second plus microsecond form that TimestampDifference
8620          * returns for homogeneous printing.
8621          */
8622         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8623         longest_usecs = CheckpointStats.ckpt_longest_sync -
8624                 (uint64) longest_secs * 1000000;
8625
8626         average_sync_time = 0;
8627         if (CheckpointStats.ckpt_sync_rels > 0)
8628                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8629                         CheckpointStats.ckpt_sync_rels;
8630         average_secs = (long) (average_sync_time / 1000000);
8631         average_usecs = average_sync_time - (uint64) average_secs * 1000000;
8632
8633         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8634                  "%d WAL file(s) added, %d removed, %d recycled; "
8635                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8636                  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8637                  "distance=%d kB, estimate=%d kB",
8638                  restartpoint ? "restartpoint" : "checkpoint",
8639                  CheckpointStats.ckpt_bufs_written,
8640                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8641                  CheckpointStats.ckpt_segs_added,
8642                  CheckpointStats.ckpt_segs_removed,
8643                  CheckpointStats.ckpt_segs_recycled,
8644                  write_secs, write_usecs / 1000,
8645                  sync_secs, sync_usecs / 1000,
8646                  total_secs, total_usecs / 1000,
8647                  CheckpointStats.ckpt_sync_rels,
8648                  longest_secs, longest_usecs / 1000,
8649                  average_secs, average_usecs / 1000,
8650                  (int) (PrevCheckPointDistance / 1024.0),
8651                  (int) (CheckPointDistanceEstimate / 1024.0));
8652 }
8653
8654 /*
8655  * Update the estimate of distance between checkpoints.
8656  *
8657  * The estimate is used to calculate the number of WAL segments to keep
8658  * preallocated, see XLOGFileSlop().
      *
      * nbytes is the amount of WAL, in bytes, spanned by the cycle that just
      * ended.  The caller visible in CreateCheckPoint() passes the difference
      * between the new redo pointer and the prior checkpoint's redo pointer.
      *
      * Stores nbytes in PrevCheckPointDistance and updates
      * CheckPointDistanceEstimate; there is no return value.
8659  */
8660 static void
8661 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8662 {
8663         /*
8664          * To estimate the number of segments consumed between checkpoints, keep a
8665          * moving average of the amount of WAL generated in previous checkpoint
8666          * cycles. However, if the load is bursty, with quiet periods and busy
8667          * periods, we want to cater for the peak load. So instead of a plain
8668          * moving average, let the average decline slowly if the previous cycle
8669          * used less WAL than estimated, but bump it up immediately if it used
8670          * more.
8671          *
8672          * When checkpoints are triggered by max_wal_size, this should converge to
8673          * CheckPointSegments * wal_segment_size.
8674          *
8675          * Note: This doesn't pay any attention to what caused the checkpoint.
8676          * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8677          * starting a base backup, are counted the same as those created
8678          * automatically. The slow-decline will largely mask them out, if they are
8679          * not frequent. If they are frequent, it seems reasonable to count them
8680          * in as any others; if you issue a manual checkpoint every 5 minutes and
8681          * never let a timed checkpoint happen, it makes sense to base the
8682          * preallocation on that 5 minute interval rather than whatever
8683          * checkpoint_timeout is set to.
8684          */
             /* Remember the raw, unsmoothed distance of the cycle that just ended. */
8685         PrevCheckPointDistance = nbytes;
             /* More WAL than estimated: raise the estimate to it at once. */
8686         if (CheckPointDistanceEstimate < nbytes)
8687                 CheckPointDistanceEstimate = nbytes;
             /* Otherwise decay slowly: 90% of the old estimate plus 10% of nbytes. */
8688         else
8689                 CheckPointDistanceEstimate =
8690                         (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8691 }
8692
8693 /*
8694  * Perform a checkpoint --- either during shutdown, or on-the-fly
8695  *
8696  * flags is a bitwise OR of the following:
8697  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8698  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8699  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8700  *              ignoring checkpoint_completion_target parameter.
8701  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8702  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8703  *              CHECKPOINT_END_OF_RECOVERY).
8704  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8705  *
8706  * Note: flags contains other bits, of interest here only for logging purposes.
8707  * In particular note that this routine is synchronous and does not pay
8708  * attention to CHECKPOINT_WAIT.
8709  *
8710  * If !shutdown then we are writing an online checkpoint. This is a very special
8711  * kind of operation and WAL record because the checkpoint action occurs over
8712  * a period of time yet logically occurs at just a single LSN. The logical
8713  * position of the WAL record (redo ptr) is the same or earlier than the
8714  * physical position. When we replay WAL we locate the checkpoint via its
8715  * physical position then read the redo ptr and actually start replay at the
8716  * earlier logical position. Note that we don't write *anything* to WAL at
8717  * the logical position, so that location could be any other kind of WAL record.
8718  * All of this mechanism allows us to continue working while we checkpoint.
8719  * As a result, timing of actions is critical here; also keep in mind that
8720  * this function will likely take minutes to execute on a busy system.
      *
      * Outcomes, as implemented below:
      *      - Returns (void) without writing a checkpoint record when none of
      *        CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY or
      *        CHECKPOINT_FORCE is set and the last important WAL record is still
      *        the previous checkpoint record.
      *      - Raises ERROR if recovery is in progress and
      *        CHECKPOINT_END_OF_RECOVERY is not set.
      *      - Raises PANIC if, for a shutdown or end-of-recovery checkpoint, the
      *        checkpoint record did not land exactly at the redo pointer.
      *
      * CheckpointLock is held from shortly after entry until the function
      * returns, on both the normal and the skipped path.
8721  */
8722 void
8723 CreateCheckPoint(int flags)
8724 {
8725         bool            shutdown;
8726         CheckPoint      checkPoint;
8727         XLogRecPtr      recptr;
8728         XLogSegNo       _logSegNo;
8729         XLogCtlInsert *Insert = &XLogCtl->Insert;
8730         uint32          freespace;
8731         XLogRecPtr      PriorRedoPtr;
8732         XLogRecPtr      curInsert;
8733         XLogRecPtr      last_important_lsn;
8734         VirtualTransactionId *vxids;
8735         int                     nvxids;
8736
8737         /*
8738          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8739          * issued at a different time.
8740          */
8741         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8742                 shutdown = true;
8743         else
8744                 shutdown = false;
8745
8746         /* sanity check */
8747         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8748                 elog(ERROR, "can't create a checkpoint during recovery");
8749
8750         /*
8751          * Initialize InitXLogInsert working areas before entering the critical
8752          * section.  Normally, this is done by the first call to
8753          * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8754          * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8755          * done below in a critical section, and InitXLogInsert cannot be called
8756          * in a critical section.
8757          */
8758         InitXLogInsert();
8759
8760         /*
8761          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8762          * (This is just pro forma, since in the present system structure there is
8763          * only one process that is allowed to issue checkpoints at any given
8764          * time.)
8765          */
8766         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8767
8768         /*
8769          * Prepare to accumulate statistics.
8770          *
8771          * Note: because it is possible for log_checkpoints to change while a
8772          * checkpoint proceeds, we always accumulate stats, even if
8773          * log_checkpoints is currently off.
8774          */
8775         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8776         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8777
8778         /*
8779          * Use a critical section to force system panic if we have trouble.
8780          */
8781         START_CRIT_SECTION();
8782
             /*
              * For a shutdown or end-of-recovery checkpoint, first record state
              * DB_SHUTDOWNING (and the current time) in the control file.
              */
8783         if (shutdown)
8784         {
8785                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8786                 ControlFile->state = DB_SHUTDOWNING;
8787                 ControlFile->time = (pg_time_t) time(NULL);
8788                 UpdateControlFile();
8789                 LWLockRelease(ControlFileLock);
8790         }
8791
8792         /*
8793          * Let smgr prepare for checkpoint; this has to happen before we determine
8794          * the REDO pointer.  Note that smgr must not do anything that'd have to
8795          * be undone if we decide no checkpoint is needed.
8796          */
8797         smgrpreckpt();
8798
8799         /* Begin filling in the checkpoint WAL record */
8800         MemSet(&checkPoint, 0, sizeof(checkPoint));
8801         checkPoint.time = (pg_time_t) time(NULL);
8802
8803         /*
8804          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8805          * pointer. This allows us to begin accumulating changes to assemble our
8806          * starting snapshot of locks and transactions.
8807          */
8808         if (!shutdown && XLogStandbyInfoActive())
8809                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8810         else
8811                 checkPoint.oldestActiveXid = InvalidTransactionId;
8812
8813         /*
8814          * Get location of last important record before acquiring insert locks (as
8815          * GetLastImportantRecPtr() also locks WAL locks).
8816          */
8817         last_important_lsn = GetLastImportantRecPtr();
8818
8819         /*
8820          * We must block concurrent insertions while examining insert state to
8821          * determine the checkpoint REDO pointer.
8822          */
8823         WALInsertLockAcquireExclusive();
8824         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8825
8826         /*
8827          * If this isn't a shutdown, end-of-recovery or forced checkpoint, and
8828          * if there has been no WAL activity requiring a checkpoint, skip it.  The
8829          * idea is to avoid inserting duplicate checkpoints when the system is idle.
8830          */
8831         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8832                                   CHECKPOINT_FORCE)) == 0)
8833         {
8834                 if (last_important_lsn == ControlFile->checkPoint)
8835                 {
                             /*
                              * Nothing to do: back out of the insertion locks,
                              * CheckpointLock and the critical section taken above
                              * before returning.
                              */
8836                         WALInsertLockRelease();
8837                         LWLockRelease(CheckpointLock);
8838                         END_CRIT_SECTION();
8839                         ereport(DEBUG1,
8840                                         (errmsg("checkpoint skipped because system is idle")));
8841                         return;
8842                 }
8843         }
8844
8845         /*
8846          * An end-of-recovery checkpoint is created before anyone is allowed to
8847          * write WAL. To allow us to write the checkpoint record, temporarily
8848          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8849          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8850          */
8851         if (flags & CHECKPOINT_END_OF_RECOVERY)
8852                 LocalSetXLogInsertAllowed();
8853
8854         checkPoint.ThisTimeLineID = ThisTimeLineID;
8855         if (flags & CHECKPOINT_END_OF_RECOVERY)
8856                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8857         else
8858                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8859
8860         checkPoint.fullPageWrites = Insert->fullPageWrites;
8861
8862         /*
8863          * Compute new REDO record ptr = location of next XLOG record.
8864          *
8865          * NB: this is NOT necessarily where the checkpoint record itself will be,
8866          * since other backends may insert more XLOG records while we're off doing
8867          * the buffer flush work.  Those XLOG records are logically after the
8868          * checkpoint, even though physically before it.  Got that?
              *
              * If the insert position has no free space left (per INSERT_FREESPACE),
              * step over the page header that will precede the next record: the long
              * header at a segment boundary, the short one otherwise.
8869          */
8870         freespace = INSERT_FREESPACE(curInsert);
8871         if (freespace == 0)
8872         {
8873                 if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
8874                         curInsert += SizeOfXLogLongPHD;
8875                 else
8876                         curInsert += SizeOfXLogShortPHD;
8877         }
8878         checkPoint.redo = curInsert;
8879
8880         /*
8881          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8882          * must be done while holding all the insertion locks.
8883          *
8884          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8885          * pointing past where it really needs to point.  This is okay; the only
8886          * consequence is that XLogInsert might back up whole buffers that it
8887          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8888          * XLogInserts that happen while we are dumping buffers must assume that
8889          * their buffer changes are not included in the checkpoint.
8890          */
8891         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8892
8893         /*
8894          * Now we can release the WAL insertion locks, allowing other xacts to
8895          * proceed while we are flushing disk buffers.
8896          */
8897         WALInsertLockRelease();
8898
8899         /* Update the info_lck-protected copy of RedoRecPtr as well */
8900         SpinLockAcquire(&XLogCtl->info_lck);
8901         XLogCtl->RedoRecPtr = checkPoint.redo;
8902         SpinLockRelease(&XLogCtl->info_lck);
8903
8904         /*
8905          * If enabled, log checkpoint start.  We postpone this until now so as not
8906          * to log anything if we decided to skip the checkpoint.
8907          */
8908         if (log_checkpoints)
8909                 LogCheckpointStart(flags, false);
8910
8911         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8912
8913         /*
8914          * Get the other info we need for the checkpoint record.
8915          *
8916          * We don't need to save oldestClogXid in the checkpoint, it only matters
8917          * for the short period in which clog is being truncated, and if we crash
8918          * during that we'll redo the clog truncation and fix up oldestClogXid
8919          * there.
8920          */
8921         LWLockAcquire(XidGenLock, LW_SHARED);
8922         checkPoint.nextXid = ShmemVariableCache->nextXid;
8923         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8924         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8925         LWLockRelease(XidGenLock);
8926
8927         LWLockAcquire(CommitTsLock, LW_SHARED);
8928         checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8929         checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8930         LWLockRelease(CommitTsLock);
8931
8932         /* Increase XID epoch if we've wrapped around since last checkpoint */
8933         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8934         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8935                 checkPoint.nextXidEpoch++;
8936
             /*
              * NOTE(review): for an online checkpoint nextOid is advanced by
              * oidCount; presumably this covers OIDs that are already reserved but
              * not yet assigned --- confirm against the OID allocation code.
              */
8937         LWLockAcquire(OidGenLock, LW_SHARED);
8938         checkPoint.nextOid = ShmemVariableCache->nextOid;
8939         if (!shutdown)
8940                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8941         LWLockRelease(OidGenLock);
8942
8943         MultiXactGetCheckptMulti(shutdown,
8944                                                          &checkPoint.nextMulti,
8945                                                          &checkPoint.nextMultiOffset,
8946                                                          &checkPoint.oldestMulti,
8947                                                          &checkPoint.oldestMultiDB);
8948
8949         /*
8950          * Having constructed the checkpoint record, ensure all shmem disk buffers
8951          * and commit-log buffers are flushed to disk.
8952          *
8953          * This I/O could fail for various reasons.  If so, we will fail to
8954          * complete the checkpoint, but there is no reason to force a system
8955          * panic. Accordingly, exit critical section while doing it.
8956          */
8957         END_CRIT_SECTION();
8958
8959         /*
8960          * In some cases there are groups of actions that must all occur on one
8961          * side or the other of a checkpoint record. Before flushing the
8962          * checkpoint record we must explicitly wait for any backend currently
8963          * performing those groups of actions.
8964          *
8965          * One example is end of transaction, so we must wait for any transactions
8966          * that are currently in commit critical sections.  If an xact inserted
8967          * its commit record into XLOG just before the REDO point, then a crash
8968          * restart from the REDO point would not replay that record, which means
8969          * that our flushing had better include the xact's update of pg_xact.  So
8970          * we wait till he's out of his commit critical section before proceeding.
8971          * See notes in RecordTransactionCommit().
8972          *
8973          * Because we've already released the insertion locks, this test is a bit
8974          * fuzzy: it is possible that we will wait for xacts we didn't really need
8975          * to wait for.  But the delay should be short and it seems better to make
8976          * checkpoint take a bit longer than to hold off insertions longer than
8977          * necessary. (In fact, the whole reason we have this issue is that xact.c
8978          * does commit record XLOG insertion and clog update as two separate steps
8979          * protected by different locks, but again that seems best on grounds of
8980          * minimizing lock contention.)
8981          *
8982          * A transaction that has not yet set delayChkpt when we look cannot be at
8983          * risk, since he's not inserted his commit record yet; and one that's
8984          * already cleared it is not at risk either, since he's done fixing clog
8985          * and we will correctly flush the update below.  So we cannot miss any
8986          * xacts we need to wait for.
              *
              * The list is collected once; we then poll every 10 msec until none of
              * those particular virtual transactions is still delaying.
8987          */
8988         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8989         if (nvxids > 0)
8990         {
8991                 do
8992                 {
8993                         pg_usleep(10000L);      /* wait for 10 msec */
8994                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8995         }
8996         pfree(vxids);
8997
8998         CheckPointGuts(checkPoint.redo, flags);
8999
9000         /*
9001          * Take a snapshot of running transactions and write this to WAL. This
9002          * allows us to reconstruct the state of running transactions during
9003          * archive recovery, if required. Skip, if this info disabled.
9004          *
9005          * If we are shutting down, or Startup process is completing crash
9006          * recovery we don't need to write running xact data.
9007          */
9008         if (!shutdown && XLogStandbyInfoActive())
9009                 LogStandbySnapshot();
9010
9011         START_CRIT_SECTION();
9012
9013         /*
9014          * Now insert the checkpoint record into XLOG.
9015          */
9016         XLogBeginInsert();
9017         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
9018         recptr = XLogInsert(RM_XLOG_ID,
9019                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
9020                                                 XLOG_CHECKPOINT_ONLINE);
9021
9022         XLogFlush(recptr);
9023
9024         /*
9025          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
9026          * overwritten at next startup.  No-one should even try, this just allows
9027          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
9028          * to just temporarily disable writing until the system has exited
9029          * recovery.
9030          */
9031         if (shutdown)
9032         {
9033                 if (flags & CHECKPOINT_END_OF_RECOVERY)
9034                         LocalXLogInsertAllowed = -1;    /* return to "check" state */
9035                 else
9036                         LocalXLogInsertAllowed = 0; /* never again write WAL */
9037         }
9038
9039         /*
9040          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
9041          * = end of actual checkpoint record.
              *
              * For a shutdown-type checkpoint the record must start exactly at the
              * redo pointer; anything else means WAL was inserted in between.
9042          */
9043         if (shutdown && checkPoint.redo != ProcLastRecPtr)
9044                 ereport(PANIC,
9045                                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
9046
9047         /*
9048          * Remember the prior checkpoint's redo ptr for
9049          * UpdateCheckPointDistanceEstimate()
9050          */
9051         PriorRedoPtr = ControlFile->checkPointCopy.redo;
9052
9053         /*
9054          * Update the control file.
9055          */
9056         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9057         if (shutdown)
9058                 ControlFile->state = DB_SHUTDOWNED;
9059         ControlFile->checkPoint = ProcLastRecPtr;
9060         ControlFile->checkPointCopy = checkPoint;
9061         ControlFile->time = (pg_time_t) time(NULL);
9062         /* crash recovery should always recover to the end of WAL */
9063         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
9064         ControlFile->minRecoveryPointTLI = 0;
9065
9066         /*
9067          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
9068          * unused on non-shutdown checkpoints, but seems useful to store it always
9069          * for debugging purposes.
9070          */
9071         SpinLockAcquire(&XLogCtl->ulsn_lck);
9072         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
9073         SpinLockRelease(&XLogCtl->ulsn_lck);
9074
9075         UpdateControlFile();
9076         LWLockRelease(ControlFileLock);
9077
9078         /* Update shared-memory copy of checkpoint XID/epoch */
9079         SpinLockAcquire(&XLogCtl->info_lck);
9080         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9081         XLogCtl->ckptXid = checkPoint.nextXid;
9082         SpinLockRelease(&XLogCtl->info_lck);
9083
9084         /*
9085          * We are now done with critical updates; no need for system panic if we
9086          * have trouble while fooling with old log segments.
9087          */
9088         END_CRIT_SECTION();
9089
9090         /*
9091          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
9092          */
9093         smgrpostckpt();
9094
9095         /*
9096          * Update the average distance between checkpoints if the prior checkpoint
9097          * exists.
9098          */
9099         if (PriorRedoPtr != InvalidXLogRecPtr)
9100                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9101
9102         /*
9103          * Delete old log files, those no longer needed for last checkpoint to
9104          * prevent the disk holding the xlog from growing full.
              *
              * NOTE(review): _logSegNo starts as the segment containing RedoRecPtr,
              * may be adjusted by KeepLogSeg(), and is then decremented; presumably
              * RemoveOldXlogFiles() treats it as the last removable segment ---
              * confirm against that function.
9105          */
9106         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9107         KeepLogSeg(recptr, &_logSegNo);
9108         _logSegNo--;
9109         RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
9110
9111         /*
9112          * Make more log segments if needed.  (Do this after recycling old log
9113          * segments, since that may supply some of the needed files.)
9114          */
9115         if (!shutdown)
9116                 PreallocXlogFiles(recptr);
9117
9118         /*
9119          * Truncate pg_subtrans if possible.  We can throw away all data before
9120          * the oldest XMIN of any running transaction.  No future transaction will
9121          * attempt to reference any pg_subtrans entry older than that (see Asserts
9122          * in subtrans.c).  During recovery, though, we mustn't do this because
9123          * StartupSUBTRANS hasn't been called yet.
9124          */
9125         if (!RecoveryInProgress())
9126                 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9127
9128         /* Real work is done, but log and update stats before releasing lock. */
9129         LogCheckpointEnd(false);
9130
9131         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
9132                                                                          NBuffers,
9133                                                                          CheckpointStats.ckpt_segs_added,
9134                                                                          CheckpointStats.ckpt_segs_removed,
9135                                                                          CheckpointStats.ckpt_segs_recycled);
9136
9137         LWLockRelease(CheckpointLock);
9138 }
9139
9140 /*
9141  * Mark the end of recovery in WAL though without running a full checkpoint.
9142  * We can expect that a restartpoint is likely to be in progress as we
9143  * do this, though we are unwilling to wait for it to complete. So be
9144  * careful to avoid taking the CheckpointLock anywhere here.
9145  *
9146  * CreateRestartPoint() allows for the case where recovery may end before
9147  * the restartpoint completes so there is no concern of concurrent behaviour.
      *
      * Takes no arguments and returns nothing.  Raises ERROR if recovery is not
      * in progress.  Otherwise it inserts an XLOG_END_OF_RECOVERY record carrying
      * the current time and the current and previous timeline IDs, calls
      * XLogFlush() on it, and sets the control file's minRecoveryPoint to the
      * pointer returned by XLogInsert() for that record.
9148  */
9149 static void
9150 CreateEndOfRecoveryRecord(void)
9151 {
9152         xl_end_of_recovery xlrec;
9153         XLogRecPtr      recptr;
9154
9155         /* sanity check */
9156         if (!RecoveryInProgress())
9157                 elog(ERROR, "can only be used to end recovery");
9158
9159         xlrec.end_time = GetCurrentTimestamp();
9160
             /* Read the timeline IDs while holding the WAL insertion locks exclusively. */
9161         WALInsertLockAcquireExclusive();
9162         xlrec.ThisTimeLineID = ThisTimeLineID;
9163         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
9164         WALInsertLockRelease();
9165
             /*
              * Temporarily let this process insert WAL; the flag is put back into
              * its "check" state at the bottom of this function.
              */
9166         LocalSetXLogInsertAllowed();
9167
             /* The insert, flush and control-file update all run in one critical section. */
9168         START_CRIT_SECTION();
9169
9170         XLogBeginInsert();
9171         XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
9172         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
9173
9174         XLogFlush(recptr);
9175
9176         /*
9177          * Update the control file so that crash recovery can follow the timeline
9178          * changes to this point.
9179          */
9180         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9181         ControlFile->time = (pg_time_t) time(NULL);
9182         ControlFile->minRecoveryPoint = recptr;
9183         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9184         UpdateControlFile();
9185         LWLockRelease(ControlFileLock);
9186
9187         END_CRIT_SECTION();
9188
9189         LocalXLogInsertAllowed = -1;    /* return to "check" state */
9190 }
9191
9192 /*
9193  * Flush all data in shared memory to disk, and fsync
9194  *
9195  * This is the common code shared between regular checkpoints and
9196  * recovery restartpoints.
      *
      * checkPointRedo is the redo pointer of the checkpoint or restartpoint being
      * performed; here it is only handed on to CheckPointTwoPhase().  flags is
      * the caller's checkpoint flag set; here it is only handed on to
      * CheckPointBuffers().  There is no return value.
      *
      * CreateCheckPoint() calls this between its critical sections, so that an
      * I/O failure in here fails the checkpoint without forcing a panic.
9197  */
9198 static void
9199 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
9200 {
             /* Each subsystem's checkpoint routine is invoked once, in this fixed order. */
9201         CheckPointCLOG();
9202         CheckPointCommitTs();
9203         CheckPointSUBTRANS();
9204         CheckPointMultiXact();
9205         CheckPointPredicate();
9206         CheckPointRelationMap();
9207         CheckPointReplicationSlots();
9208         CheckPointSnapBuild();
9209         CheckPointLogicalRewriteHeap();
9210         CheckPointBuffers(flags);       /* performs all required fsyncs */
9211         CheckPointReplicationOrigin();
9212         /* We deliberately delay 2PC checkpointing as long as possible */
9213         CheckPointTwoPhase(checkPointRedo);
9214 }
9215
9216 /*
9217  * Save a checkpoint for recovery restart if appropriate
9218  *
9219  * This function is called each time a checkpoint record is read from XLOG.
9220  * It must determine whether the checkpoint represents a safe restartpoint or
9221  * not.  If so, the checkpoint record is stashed in shared memory so that
9222  * CreateRestartPoint can consult it.  (Note that the latter function is
9223  * executed by the checkpointer, while this one will be executed by the
9224  * startup process.)
      *
      * checkPoint points to the contents of the checkpoint record; the struct is
      * copied into shared memory, the pointer itself is not retained.  There is
      * no return value: when the checkpoint is not usable as a restartpoint the
      * function only emits a debug message and leaves shared memory untouched.
9225  */
9226 static void
9227 RecoveryRestartPoint(const CheckPoint *checkPoint)
9228 {
9229         /*
9230          * Refrain from creating a restartpoint if we have seen any
9231          * references to non-existent pages. Restarting recovery from the
9232          * restartpoint would not see the references, so we would lose the
9233          * cross-check that the pages belonged to a relation that was dropped
9234          * later.
9235          */
9236         if (XLogHaveInvalidPages())
9237         {
9238                 elog(trace_recovery(DEBUG2),
9239                          "could not record restart point at %X/%X because there "
9240                          "are unresolved references to invalid pages",
9241                          (uint32) (checkPoint->redo >> 32),
9242                          (uint32) checkPoint->redo);
9243                 return;
9244         }
9245
9246         /*
9247          * Copy the checkpoint record to shared memory, so that checkpointer can
9248          * work out the next time it wants to perform a restartpoint.
              *
              * All three fields are written under info_lck, which is also what
              * CreateRestartPoint() holds while reading them.
              *
              * NOTE(review): ReadRecPtr and EndRecPtr are presumably the start and
              * end of the record just read --- confirm against their definitions.
9249          */
9250         SpinLockAcquire(&XLogCtl->info_lck);
9251         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
9252         XLogCtl->lastCheckPointEndPtr = EndRecPtr;
9253         XLogCtl->lastCheckPoint = *checkPoint;
9254         SpinLockRelease(&XLogCtl->info_lck);
9255 }
9256
9257 /*
9258  * Establish a restartpoint if possible.
9259  *
9260  * This is similar to CreateCheckPoint, but is used during WAL recovery
9261  * to establish a point from which recovery can roll forward without
9262  * replaying the entire recovery log.
9263  *
9264  * Returns true if a new restartpoint was established. We can only establish
9265  * a restartpoint if we have replayed a safe checkpoint record since last
9266  * restartpoint.
9267  */
9268 bool
9269 CreateRestartPoint(int flags)
9270 {
9271         XLogRecPtr      lastCheckPointRecPtr;
9272         XLogRecPtr      lastCheckPointEndPtr;
9273         CheckPoint      lastCheckPoint;
9274         XLogRecPtr      PriorRedoPtr;
9275         XLogRecPtr      receivePtr;
9276         XLogRecPtr      replayPtr;
9277         TimeLineID      replayTLI;
9278         XLogRecPtr      endptr;
9279         XLogSegNo       _logSegNo;
9280         TimestampTz xtime;
9281
9282         /*
9283          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9284          * happens at a time.
9285          */
9286         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9287
9288         /* Get a local copy of the last safe checkpoint record. */
9289         SpinLockAcquire(&XLogCtl->info_lck);
9290         lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9291         lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9292         lastCheckPoint = XLogCtl->lastCheckPoint;
9293         SpinLockRelease(&XLogCtl->info_lck);
9294
9295         /*
9296          * Check that we're still in recovery mode. It's ok if we exit recovery
9297          * mode after this check, the restart point is valid anyway.
9298          */
9299         if (!RecoveryInProgress())
9300         {
9301                 ereport(DEBUG2,
9302                                 (errmsg("skipping restartpoint, recovery has already ended")));
9303                 LWLockRelease(CheckpointLock);
9304                 return false;
9305         }
9306
9307         /*
9308          * If the last checkpoint record we've replayed is already our last
9309          * restartpoint, we can't perform a new restart point. We still update
9310          * minRecoveryPoint in that case, so that if this is a shutdown restart
9311          * point, we won't start up earlier than before. That's not strictly
9312          * necessary, but when hot standby is enabled, it would be rather weird if
9313          * the database opened up for read-only connections at a point-in-time
9314          * before the last shutdown. Such time travel is still possible in case of
9315          * immediate shutdown, though.
9316          *
9317          * We don't explicitly advance minRecoveryPoint when we do create a
9318          * restartpoint. It's assumed that flushing the buffers will do that as a
9319          * side-effect.
9320          */
9321         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9322                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9323         {
9324                 ereport(DEBUG2,
9325                                 (errmsg("skipping restartpoint, already performed at %X/%X",
9326                                                 (uint32) (lastCheckPoint.redo >> 32),
9327                                                 (uint32) lastCheckPoint.redo)));
9328
9329                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9330                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9331                 {
9332                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9333                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9334                         ControlFile->time = (pg_time_t) time(NULL);
9335                         UpdateControlFile();
9336                         LWLockRelease(ControlFileLock);
9337                 }
9338                 LWLockRelease(CheckpointLock);
9339                 return false;
9340         }
9341
9342         /*
9343          * Update the shared RedoRecPtr so that the startup process can calculate
9344          * the number of segments replayed since last restartpoint, and request a
9345          * restartpoint if it exceeds CheckPointSegments.
9346          *
9347          * Like in CreateCheckPoint(), hold off insertions to update it, although
9348          * during recovery this is just pro forma, because no WAL insertions are
9349          * happening.
9350          */
9351         WALInsertLockAcquireExclusive();
9352         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9353         WALInsertLockRelease();
9354
9355         /* Also update the info_lck-protected copy */
9356         SpinLockAcquire(&XLogCtl->info_lck);
9357         XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9358         SpinLockRelease(&XLogCtl->info_lck);
9359
9360         /*
9361          * Prepare to accumulate statistics.
9362          *
9363          * Note: because it is possible for log_checkpoints to change while a
9364          * checkpoint proceeds, we always accumulate stats, even if
9365          * log_checkpoints is currently off.
9366          */
9367         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9368         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9369
9370         if (log_checkpoints)
9371                 LogCheckpointStart(flags, true);
9372
9373         CheckPointGuts(lastCheckPoint.redo, flags);
9374
9375         /*
9376          * Remember the prior checkpoint's redo ptr for
9377          * UpdateCheckPointDistanceEstimate()
9378          */
9379         PriorRedoPtr = ControlFile->checkPointCopy.redo;
9380
9381         /*
9382          * Update pg_control, using current time.  Check that it still shows
9383          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9384          * this is a quick hack to make sure nothing really bad happens if somehow
9385          * we get here after the end-of-recovery checkpoint.
9386          */
9387         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9388         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9389                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9390         {
9391                 ControlFile->checkPoint = lastCheckPointRecPtr;
9392                 ControlFile->checkPointCopy = lastCheckPoint;
9393                 ControlFile->time = (pg_time_t) time(NULL);
9394
9395                 /*
9396                  * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
9397                  * this will have happened already while writing out dirty buffers,
9398                  * but not necessarily - e.g. because no buffers were dirtied.  We do
9399                  * this because a non-exclusive base backup uses minRecoveryPoint to
9400                  * determine which WAL files must be included in the backup, and the
9401                  * file (or files) containing the checkpoint record must be included,
9402                  * at a minimum. Note that for an ordinary restart of recovery there's
9403                  * no value in having the minimum recovery point any earlier than this
9404                  * anyway, because redo will begin just after the checkpoint record.
9405                  */
9406                 if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9407                 {
9408                         ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9409                         ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9410
9411                         /* update local copy */
9412                         minRecoveryPoint = ControlFile->minRecoveryPoint;
9413                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9414                 }
9415                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9416                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9417                 UpdateControlFile();
9418         }
9419         LWLockRelease(ControlFileLock);
9420
9421         /*
9422          * Update the average distance between checkpoints/restartpoints if the
9423          * prior checkpoint exists.
9424          */
9425         if (PriorRedoPtr != InvalidXLogRecPtr)
9426                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9427
9428         /*
9429          * Delete old log files, those no longer needed for last restartpoint to
9430          * prevent the disk holding the xlog from growing full.
9431          */
9432         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9433
9434         /*
9435          * Retreat _logSegNo using the current end of xlog replayed or received,
9436          * whichever is later.
9437          */
9438         receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9439         replayPtr = GetXLogReplayRecPtr(&replayTLI);
9440         endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9441         KeepLogSeg(endptr, &_logSegNo);
9442         _logSegNo--;
9443
9444         /*
9445          * Try to recycle segments on a useful timeline. If we've been promoted
9446          * since the beginning of this restartpoint, use the new timeline chosen
9447          * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
9448          * case). If we're still in recovery, use the timeline we're currently
9449          * replaying.
9450          *
9451          * There is no guarantee that the WAL segments will be useful on the
9452          * current timeline; if recovery proceeds to a new timeline right after
9453          * this, the pre-allocated WAL segments on this timeline will not be used,
9454          * and will go wasted until recycled on the next restartpoint. We'll live
9455          * with that.
9456          */
9457         if (RecoveryInProgress())
9458                 ThisTimeLineID = replayTLI;
9459
9460         RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
9461
9462         /*
9463          * Make more log segments if needed.  (Do this after recycling old log
9464          * segments, since that may supply some of the needed files.)
9465          */
9466         PreallocXlogFiles(endptr);
9467
9468         /*
9469          * ThisTimeLineID is normally not set when we're still in recovery.
9470          * However, recycling/preallocating segments above needed ThisTimeLineID
9471          * to determine which timeline to install the segments on. Reset it now,
9472          * to restore the normal state of affairs for debugging purposes.
9473          */
9474         if (RecoveryInProgress())
9475                 ThisTimeLineID = 0;
9476
9477         /*
9478          * Truncate pg_subtrans if possible.  We can throw away all data before
9479          * the oldest XMIN of any running transaction.  No future transaction will
9480          * attempt to reference any pg_subtrans entry older than that (see Asserts
9481          * in subtrans.c).  When hot standby is disabled, though, we mustn't do
9482          * this because StartupSUBTRANS hasn't been called yet.
9483          */
9484         if (EnableHotStandby)
9485                 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9486
9487         /* Real work is done, but log and update before releasing lock. */
9488         LogCheckpointEnd(true);
9489
9490         xtime = GetLatestXTime();
9491         ereport((log_checkpoints ? LOG : DEBUG2),
9492                         (errmsg("recovery restart point at %X/%X",
9493                                         (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9494                          xtime ? errdetail("Last completed transaction was at log time %s.",
9495                                                            timestamptz_to_str(xtime)) : 0));
9496
9497         LWLockRelease(CheckpointLock);
9498
9499         /*
9500          * Finally, execute archive_cleanup_command, if any.
9501          */
9502         if (XLogCtl->archiveCleanupCommand[0])
9503                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9504                                                            "archive_cleanup_command",
9505                                                            false);
9506
9507         return true;
9508 }
9509
9510 /*
9511  * Retreat *logSegNo to the last segment that we need to retain because of
9512  * either wal_keep_segments or replication slots.
9513  *
9514  * This is calculated by subtracting wal_keep_segments from the given xlog
9515  * location, recptr and by making sure that that result is below the
9516  * requirement of replication slots.
9517  */
9518 static void
9519 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9520 {
9521         XLogSegNo       segno;
9522         XLogRecPtr      keep;
9523
9524         XLByteToSeg(recptr, segno, wal_segment_size);
9525         keep = XLogGetReplicationSlotMinimumLSN();
9526
9527         /* compute limit for wal_keep_segments first */
9528         if (wal_keep_segments > 0)
9529         {
9530                 /* avoid underflow, don't go below 1 */
9531                 if (segno <= wal_keep_segments)
9532                         segno = 1;
9533                 else
9534                         segno = segno - wal_keep_segments;
9535         }
9536
9537         /* then check whether slots limit removal further */
9538         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9539         {
9540                 XLogSegNo       slotSegNo;
9541
9542                 XLByteToSeg(keep, slotSegNo, wal_segment_size);
9543
9544                 if (slotSegNo <= 0)
9545                         segno = 1;
9546                 else if (slotSegNo < segno)
9547                         segno = slotSegNo;
9548         }
9549
9550         /* don't delete WAL segments newer than the calculated segment */
9551         if (segno < *logSegNo)
9552                 *logSegNo = segno;
9553 }
9554
9555 /*
9556  * Write a NEXTOID log record
9557  */
9558 void
9559 XLogPutNextOid(Oid nextOid)
9560 {
9561         XLogBeginInsert();
9562         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9563         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9564
9565         /*
9566          * We need not flush the NEXTOID record immediately, because any of the
9567          * just-allocated OIDs could only reach disk as part of a tuple insert or
9568          * update that would have its own XLOG record that must follow the NEXTOID
9569          * record.  Therefore, the standard buffer LSN interlock applied to those
9570          * records will ensure no such OID reaches disk before the NEXTOID record
9571          * does.
9572          *
9573          * Note, however, that the above statement only covers state "within" the
9574          * database.  When we use a generated OID as a file or directory name, we
9575          * are in a sense violating the basic WAL rule, because that filesystem
9576          * change may reach disk before the NEXTOID WAL record does.  The impact
9577          * of this is that if a database crash occurs immediately afterward, we
9578          * might after restart re-generate the same OID and find that it conflicts
9579          * with the leftover file or directory.  But since for safety's sake we
9580          * always loop until finding a nonconflicting filename, this poses no real
9581          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9582          */
9583 }
9584
9585 /*
9586  * Write an XLOG SWITCH record.
9587  *
9588  * Here we just blindly issue an XLogInsert request for the record.
9589  * All the magic happens inside XLogInsert.
9590  *
9591  * The return value is either the end+1 address of the switch record,
9592  * or the end+1 address of the prior segment if we did not need to
9593  * write a switch record because we are already at segment start.
9594  */
9595 XLogRecPtr
9596 RequestXLogSwitch(bool mark_unimportant)
9597 {
9598         XLogRecPtr      RecPtr;
9599
9600         /* XLOG SWITCH has no data */
9601         XLogBeginInsert();
9602
9603         if (mark_unimportant)
9604                 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9605         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9606
9607         return RecPtr;
9608 }
9609
9610 /*
9611  * Write a RESTORE POINT record
9612  */
9613 XLogRecPtr
9614 XLogRestorePoint(const char *rpName)
9615 {
9616         XLogRecPtr      RecPtr;
9617         xl_restore_point xlrec;
9618
9619         xlrec.rp_time = GetCurrentTimestamp();
9620         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9621
9622         XLogBeginInsert();
9623         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9624
9625         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9626
9627         ereport(LOG,
9628                         (errmsg("restore point \"%s\" created at %X/%X",
9629                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9630
9631         return RecPtr;
9632 }
9633
9634 /*
9635  * Check if any of the GUC parameters that are critical for hot standby
9636  * have changed, and update the value in pg_control file if necessary.
9637  */
9638 static void
9639 XLogReportParameters(void)
9640 {
9641         if (wal_level != ControlFile->wal_level ||
9642                 wal_log_hints != ControlFile->wal_log_hints ||
9643                 MaxConnections != ControlFile->MaxConnections ||
9644                 max_worker_processes != ControlFile->max_worker_processes ||
9645                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9646                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
9647                 track_commit_timestamp != ControlFile->track_commit_timestamp)
9648         {
9649                 /*
9650                  * The change in number of backend slots doesn't need to be WAL-logged
9651                  * if archiving is not enabled, as you can't start archive recovery
9652                  * with wal_level=minimal anyway. We don't really care about the
9653                  * values in pg_control either if wal_level=minimal, but seems better
9654                  * to keep them up-to-date to avoid confusion.
9655                  */
9656                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9657                 {
9658                         xl_parameter_change xlrec;
9659                         XLogRecPtr      recptr;
9660
9661                         xlrec.MaxConnections = MaxConnections;
9662                         xlrec.max_worker_processes = max_worker_processes;
9663                         xlrec.max_prepared_xacts = max_prepared_xacts;
9664                         xlrec.max_locks_per_xact = max_locks_per_xact;
9665                         xlrec.wal_level = wal_level;
9666                         xlrec.wal_log_hints = wal_log_hints;
9667                         xlrec.track_commit_timestamp = track_commit_timestamp;
9668
9669                         XLogBeginInsert();
9670                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9671
9672                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9673                         XLogFlush(recptr);
9674                 }
9675
9676                 ControlFile->MaxConnections = MaxConnections;
9677                 ControlFile->max_worker_processes = max_worker_processes;
9678                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9679                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9680                 ControlFile->wal_level = wal_level;
9681                 ControlFile->wal_log_hints = wal_log_hints;
9682                 ControlFile->track_commit_timestamp = track_commit_timestamp;
9683                 UpdateControlFile();
9684         }
9685 }
9686
9687 /*
9688  * Update full_page_writes in shared memory, and write an
9689  * XLOG_FPW_CHANGE record if necessary.
9690  *
9691  * Note: this function assumes there is no other process running
9692  * concurrently that could update it.
9693  */
9694 void
9695 UpdateFullPageWrites(void)
9696 {
9697         XLogCtlInsert *Insert = &XLogCtl->Insert;
9698
9699         /*
9700          * Do nothing if full_page_writes has not been changed.
9701          *
9702          * It's safe to check the shared full_page_writes without the lock,
9703          * because we assume that there is no concurrently running process which
9704          * can update it.
9705          */
9706         if (fullPageWrites == Insert->fullPageWrites)
9707                 return;
9708
9709         START_CRIT_SECTION();
9710
9711         /*
9712          * It's always safe to take full page images, even when not strictly
9713          * required, but not the other round. So if we're setting full_page_writes
9714          * to true, first set it true and then write the WAL record. If we're
9715          * setting it to false, first write the WAL record and then set the global
9716          * flag.
9717          */
9718         if (fullPageWrites)
9719         {
9720                 WALInsertLockAcquireExclusive();
9721                 Insert->fullPageWrites = true;
9722                 WALInsertLockRelease();
9723         }
9724
9725         /*
9726          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9727          * full_page_writes during archive recovery, if required.
9728          */
9729         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9730         {
9731                 XLogBeginInsert();
9732                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
9733
9734                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
9735         }
9736
9737         if (!fullPageWrites)
9738         {
9739                 WALInsertLockAcquireExclusive();
9740                 Insert->fullPageWrites = false;
9741                 WALInsertLockRelease();
9742         }
9743         END_CRIT_SECTION();
9744 }
9745
9746 /*
9747  * Check that it's OK to switch to new timeline during recovery.
9748  *
9749  * 'lsn' is the address of the shutdown checkpoint record we're about to
9750  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9751  */
9752 static void
9753 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9754 {
9755         /* Check that the record agrees on what the current (old) timeline is */
9756         if (prevTLI != ThisTimeLineID)
9757                 ereport(PANIC,
9758                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9759                                                 prevTLI, ThisTimeLineID)));
9760
9761         /*
9762          * The new timeline better be in the list of timelines we expect to see,
9763          * according to the timeline history. It should also not decrease.
9764          */
9765         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9766                 ereport(PANIC,
9767                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9768                                                 newTLI, ThisTimeLineID)));
9769
9770         /*
9771          * If we have not yet reached min recovery point, and we're about to
9772          * switch to a timeline greater than the timeline of the min recovery
9773          * point: trouble. After switching to the new timeline, we could not
9774          * possibly visit the min recovery point on the correct timeline anymore.
9775          * This can happen if there is a newer timeline in the archive that
9776          * branched before the timeline the min recovery point is on, and you
9777          * attempt to do PITR to the new timeline.
9778          */
9779         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9780                 lsn < minRecoveryPoint &&
9781                 newTLI > minRecoveryPointTLI)
9782                 ereport(PANIC,
9783                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9784                                                 newTLI,
9785                                                 (uint32) (minRecoveryPoint >> 32),
9786                                                 (uint32) minRecoveryPoint,
9787                                                 minRecoveryPointTLI)));
9788
9789         /* Looks good */
9790 }
9791
9792 /*
9793  * XLOG resource manager's routines
9794  *
9795  * Definitions of info values are in include/catalog/pg_control.h, though
9796  * not all record types are related to control file updates.
9797  */
9798 void
9799 xlog_redo(XLogReaderState *record)
9800 {
9801         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9802         XLogRecPtr      lsn = record->EndRecPtr;
9803
9804         /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
9805         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
9806                    !XLogRecHasAnyBlockRefs(record));
9807
9808         if (info == XLOG_NEXTOID)
9809         {
9810                 Oid                     nextOid;
9811
9812                 /*
9813                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9814                  * and the recorded nextOid, but that fails if the OID counter wraps
9815                  * around.  Since no OID allocation should be happening during replay
9816                  * anyway, better to just believe the record exactly.  We still take
9817                  * OidGenLock while setting the variable, just in case.
9818                  */
9819                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9820                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9821                 ShmemVariableCache->nextOid = nextOid;
9822                 ShmemVariableCache->oidCount = 0;
9823                 LWLockRelease(OidGenLock);
9824         }
9825         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9826         {
9827                 CheckPoint      checkPoint;
9828
9829                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9830                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9831                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9832                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9833                 LWLockRelease(XidGenLock);
9834                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9835                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9836                 ShmemVariableCache->oidCount = 0;
9837                 LWLockRelease(OidGenLock);
9838                 MultiXactSetNextMXact(checkPoint.nextMulti,
9839                                                           checkPoint.nextMultiOffset);
9840
9841                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9842                                                            checkPoint.oldestMultiDB);
9843
9844                 /*
9845                  * No need to set oldestClogXid here as well; it'll be set when we
9846                  * redo an xl_clog_truncate if it changed since initialization.
9847                  */
9848                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9849
9850                 /*
9851                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9852                  * record, the backup was canceled and the end-of-backup record will
9853                  * never arrive.
9854                  */
9855                 if (ArchiveRecoveryRequested &&
9856                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9857                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9858                         ereport(PANIC,
9859                                         (errmsg("online backup was canceled, recovery cannot continue")));
9860
9861                 /*
9862                  * If we see a shutdown checkpoint, we know that nothing was running
9863                  * on the master at this point. So fake-up an empty running-xacts
9864                  * record and use that here and now. Recover additional standby state
9865                  * for prepared transactions.
9866                  */
9867                 if (standbyState >= STANDBY_INITIALIZED)
9868                 {
9869                         TransactionId *xids;
9870                         int                     nxids;
9871                         TransactionId oldestActiveXID;
9872                         TransactionId latestCompletedXid;
9873                         RunningTransactionsData running;
9874
9875                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9876
9877                         /*
9878                          * Construct a RunningTransactions snapshot representing a shut
9879                          * down server, with only prepared transactions still alive. We're
9880                          * never overflowed at this point because all subxids are listed
9881                          * with their parent prepared transactions.
9882                          */
9883                         running.xcnt = nxids;
9884                         running.subxcnt = 0;
9885                         running.subxid_overflow = false;
9886                         running.nextXid = checkPoint.nextXid;
9887                         running.oldestRunningXid = oldestActiveXID;
9888                         latestCompletedXid = checkPoint.nextXid;
9889                         TransactionIdRetreat(latestCompletedXid);
9890                         Assert(TransactionIdIsNormal(latestCompletedXid));
9891                         running.latestCompletedXid = latestCompletedXid;
9892                         running.xids = xids;
9893
9894                         ProcArrayApplyRecoveryInfo(&running);
9895
9896                         StandbyRecoverPreparedTransactions();
9897                 }
9898
9899                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9900                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9901                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9902
9903                 /* Update shared-memory copy of checkpoint XID/epoch */
9904                 SpinLockAcquire(&XLogCtl->info_lck);
9905                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9906                 XLogCtl->ckptXid = checkPoint.nextXid;
9907                 SpinLockRelease(&XLogCtl->info_lck);
9908
9909                 /*
9910                  * We should've already switched to the new TLI before replaying this
9911                  * record.
9912                  */
9913                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9914                         ereport(PANIC,
9915                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9916                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9917
9918                 RecoveryRestartPoint(&checkPoint);
9919         }
9920         else if (info == XLOG_CHECKPOINT_ONLINE)
9921         {
9922                 CheckPoint      checkPoint;
9923
9924                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9925                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9926                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9927                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9928                                                                   checkPoint.nextXid))
9929                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9930                 LWLockRelease(XidGenLock);
9931
9932                 /*
9933                  * We ignore the nextOid counter in an ONLINE checkpoint, preferring
9934                  * to track OID assignment through XLOG_NEXTOID records.  The nextOid
9935                  * counter is from the start of the checkpoint and might well be stale
9936                  * compared to later XLOG_NEXTOID records.  We could try to take the
9937                  * maximum of the nextOid counter and our latest value, but since
9938                  * there's no particular guarantee about the speed with which the OID
9939                  * counter wraps around, that's a risky thing to do.  In any case,
9940                  * users of the nextOid counter are required to avoid assignment of
9941                  * duplicates, so that a somewhat out-of-date value should be safe.
9942                  */
9943
9944                 /* Handle multixact */
9945                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9946                                                                   checkPoint.nextMultiOffset);
9947
9948                 /*
9949                  * NB: This may perform multixact truncation when replaying WAL
9950                  * generated by an older primary.
9951                  */
9952                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9953                                                            checkPoint.oldestMultiDB);
9954                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9955                                                                   checkPoint.oldestXid))
9956                         SetTransactionIdLimit(checkPoint.oldestXid,
9957                                                                   checkPoint.oldestXidDB);
9958                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9959                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9960                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9961
9962                 /* Update shared-memory copy of checkpoint XID/epoch */
9963                 SpinLockAcquire(&XLogCtl->info_lck);
9964                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9965                 XLogCtl->ckptXid = checkPoint.nextXid;
9966                 SpinLockRelease(&XLogCtl->info_lck);
9967
9968                 /* TLI should not change in an on-line checkpoint */
9969                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9970                         ereport(PANIC,
9971                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9972                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9973
9974                 RecoveryRestartPoint(&checkPoint);
9975         }
9976         else if (info == XLOG_END_OF_RECOVERY)
9977         {
9978                 xl_end_of_recovery xlrec;
9979
9980                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9981
9982                 /*
9983                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9984                  * but this case is rarer and harder to test, so the benefit doesn't
9985                  * outweigh the potential extra cost of maintenance.
9986                  */
9987
9988                 /*
9989                  * We should've already switched to the new TLI before replaying this
9990                  * record.
9991                  */
9992                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9993                         ereport(PANIC,
9994                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9995                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9996         }
9997         else if (info == XLOG_NOOP)
9998         {
9999                 /* nothing to do here */
10000         }
10001         else if (info == XLOG_SWITCH)
10002         {
10003                 /* nothing to do here */
10004         }
10005         else if (info == XLOG_RESTORE_POINT)
10006         {
10007                 /* nothing to do here */
10008         }
10009         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
10010         {
10011                 Buffer          buffer;
10012
10013                 /*
10014                  * Full-page image (FPI) records contain nothing else but a backup
10015                  * block. The block reference must include a full-page image -
10016                  * otherwise there would be no point in this record.
10017                  *
10018                  * No recovery conflicts are generated by these generic records - if a
10019                  * resource manager needs to generate conflicts, it has to define a
10020                  * separate WAL record type and redo routine.
10021                  *
10022                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10023                  * WAL- logged because of a hint bit update. They are only generated
10024                  * when checksums are enabled. There is no difference in handling
10025                  * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10026                  * code just to distinguish them for statistics purposes.
10027                  */
10028                 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
10029                         elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10030                 UnlockReleaseBuffer(buffer);
10031         }
10032         else if (info == XLOG_BACKUP_END)
10033         {
10034                 XLogRecPtr      startpoint;
10035
10036                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10037
10038                 if (ControlFile->backupStartPoint == startpoint)
10039                 {
10040                         /*
10041                          * We have reached the end of base backup, the point where
10042                          * pg_stop_backup() was done. The data on disk is now consistent.
10043                          * Reset backupStartPoint, and update minRecoveryPoint to make
10044                          * sure we don't allow starting up at an earlier point even if
10045                          * recovery is stopped and restarted soon after this.
10046                          */
10047                         elog(DEBUG1, "end of backup reached");
10048
10049                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10050
10051                         if (ControlFile->minRecoveryPoint < lsn)
10052                         {
10053                                 ControlFile->minRecoveryPoint = lsn;
10054                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10055                         }
10056                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
10057                         ControlFile->backupEndRequired = false;
10058                         UpdateControlFile();
10059
10060                         LWLockRelease(ControlFileLock);
10061                 }
10062         }
10063         else if (info == XLOG_PARAMETER_CHANGE)
10064         {
10065                 xl_parameter_change xlrec;
10066
10067                 /* Update our copy of the parameters in pg_control */
10068                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10069
10070                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10071                 ControlFile->MaxConnections = xlrec.MaxConnections;
10072                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
10073                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10074                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10075                 ControlFile->wal_level = xlrec.wal_level;
10076                 ControlFile->wal_log_hints = xlrec.wal_log_hints;
10077
10078                 /*
10079                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
10080                  * recover back up to this point before allowing hot standby again.
10081                  * This is important if the max_* settings are decreased, to ensure
10082                  * you don't run queries against the WAL preceding the change. The
10083                  * local copies cannot be updated as long as crash recovery is
10084                  * happening and we expect all the WAL to be replayed.
10085                  */
10086                 if (InArchiveRecovery)
10087                 {
10088                         minRecoveryPoint = ControlFile->minRecoveryPoint;
10089                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10090                 }
10091                 if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10092                 {
10093                         ControlFile->minRecoveryPoint = lsn;
10094                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10095                 }
10096
10097                 CommitTsParameterChange(xlrec.track_commit_timestamp,
10098                                                                 ControlFile->track_commit_timestamp);
10099                 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10100
10101                 UpdateControlFile();
10102                 LWLockRelease(ControlFileLock);
10103
10104                 /* Check to see if any changes to max_connections give problems */
10105                 CheckRequiredParameterValues();
10106         }
10107         else if (info == XLOG_FPW_CHANGE)
10108         {
10109                 bool            fpw;
10110
10111                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10112
10113                 /*
10114                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10115                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
10116                  * full_page_writes has been disabled during online backup.
10117                  */
10118                 if (!fpw)
10119                 {
10120                         SpinLockAcquire(&XLogCtl->info_lck);
10121                         if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10122                                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10123                         SpinLockRelease(&XLogCtl->info_lck);
10124                 }
10125
10126                 /* Keep track of full_page_writes */
10127                 lastFullPageWrites = fpw;
10128         }
10129 }
10130
10131 #ifdef WAL_DEBUG
10132
10133 static void
10134 xlog_outrec(StringInfo buf, XLogReaderState *record)
10135 {
10136         int                     block_id;
10137
10138         appendStringInfo(buf, "prev %X/%X; xid %u",
10139                                          (uint32) (XLogRecGetPrev(record) >> 32),
10140                                          (uint32) XLogRecGetPrev(record),
10141                                          XLogRecGetXid(record));
10142
10143         appendStringInfo(buf, "; len %u",
10144                                          XLogRecGetDataLen(record));
10145
10146         /* decode block references */
10147         for (block_id = 0; block_id <= record->max_block_id; block_id++)
10148         {
10149                 RelFileNode rnode;
10150                 ForkNumber      forknum;
10151                 BlockNumber blk;
10152
10153                 if (!XLogRecHasBlockRef(record, block_id))
10154                         continue;
10155
10156                 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10157                 if (forknum != MAIN_FORKNUM)
10158                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10159                                                          block_id,
10160                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
10161                                                          forknum,
10162                                                          blk);
10163                 else
10164                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10165                                                          block_id,
10166                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
10167                                                          blk);
10168                 if (XLogRecHasBlockImage(record, block_id))
10169                         appendStringInfoString(buf, " FPW");
10170         }
10171 }
10172 #endif                                                  /* WAL_DEBUG */
10173
10174 /*
10175  * Returns a string describing an XLogRecord, consisting of its identity
10176  * optionally followed by a colon, a space, and a further description.
10177  */
10178 static void
10179 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10180 {
10181         RmgrId          rmid = XLogRecGetRmid(record);
10182         uint8           info = XLogRecGetInfo(record);
10183         const char *id;
10184
10185         appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10186         appendStringInfoChar(buf, '/');
10187
10188         id = RmgrTable[rmid].rm_identify(info);
10189         if (id == NULL)
10190                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10191         else
10192                 appendStringInfo(buf, "%s: ", id);
10193
10194         RmgrTable[rmid].rm_desc(buf, record);
10195 }
10196
10197
10198 /*
10199  * Return the (possible) sync flag used for opening a file, depending on the
10200  * value of the GUC wal_sync_method.
10201  */
10202 static int
10203 get_sync_bit(int method)
10204 {
10205         int                     o_direct_flag = 0;
10206
10207         /* If fsync is disabled, never open in sync mode */
10208         if (!enableFsync)
10209                 return 0;
10210
10211         /*
10212          * Optimize writes by bypassing kernel cache with O_DIRECT when using
10213          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
10214          * disabled, otherwise the archive command or walsender process will read
10215          * the WAL soon after writing it, which is guaranteed to cause a physical
10216          * read if we bypassed the kernel cache. We also skip the
10217          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10218          * reason.
10219          *
10220          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10221          * written by walreceiver is normally read by the startup process soon
10222          * after its written. Also, walreceiver performs unaligned writes, which
10223          * don't work with O_DIRECT, so it is required for correctness too.
10224          */
10225         if (!XLogIsNeeded() && !AmWalReceiverProcess())
10226                 o_direct_flag = PG_O_DIRECT;
10227
10228         switch (method)
10229         {
10230                         /*
10231                          * enum values for all sync options are defined even if they are
10232                          * not supported on the current platform.  But if not, they are
10233                          * not included in the enum option array, and therefore will never
10234                          * be seen here.
10235                          */
10236                 case SYNC_METHOD_FSYNC:
10237                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10238                 case SYNC_METHOD_FDATASYNC:
10239                         return 0;
10240 #ifdef OPEN_SYNC_FLAG
10241                 case SYNC_METHOD_OPEN:
10242                         return OPEN_SYNC_FLAG | o_direct_flag;
10243 #endif
10244 #ifdef OPEN_DATASYNC_FLAG
10245                 case SYNC_METHOD_OPEN_DSYNC:
10246                         return OPEN_DATASYNC_FLAG | o_direct_flag;
10247 #endif
10248                 default:
10249                         /* can't happen (unless we are out of sync with option array) */
10250                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
10251                         return 0;                       /* silence warning */
10252         }
10253 }
10254
10255 /*
10256  * GUC support
10257  */
10258 void
10259 assign_xlog_sync_method(int new_sync_method, void *extra)
10260 {
10261         if (sync_method != new_sync_method)
10262         {
10263                 /*
10264                  * To ensure that no blocks escape unsynced, force an fsync on the
10265                  * currently open log segment (if any).  Also, if the open flag is
10266                  * changing, close the log file so it will be reopened (with new flag
10267                  * bit) at next use.
10268                  */
10269                 if (openLogFile >= 0)
10270                 {
10271                         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10272                         if (pg_fsync(openLogFile) != 0)
10273                                 ereport(PANIC,
10274                                                 (errcode_for_file_access(),
10275                                                  errmsg("could not fsync file \"%s\": %m",
10276                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10277                         pgstat_report_wait_end();
10278                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10279                                 XLogFileClose();
10280                 }
10281         }
10282 }
10283
10284
10285 /*
10286  * Issue appropriate kind of fsync (if any) for an XLOG output file.
10287  *
10288  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10289  * 'log' and 'seg' are for error reporting purposes.
10290  */
10291 void
10292 issue_xlog_fsync(int fd, XLogSegNo segno)
10293 {
10294         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
10295         switch (sync_method)
10296         {
10297                 case SYNC_METHOD_FSYNC:
10298                         if (pg_fsync_no_writethrough(fd) != 0)
10299                                 ereport(PANIC,
10300                                                 (errcode_for_file_access(),
10301                                                  errmsg("could not fsync file \"%s\": %m",
10302                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10303                         break;
10304 #ifdef HAVE_FSYNC_WRITETHROUGH
10305                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10306                         if (pg_fsync_writethrough(fd) != 0)
10307                                 ereport(PANIC,
10308                                                 (errcode_for_file_access(),
10309                                                  errmsg("could not fsync write-through file \"%s\": %m",
10310                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10311                         break;
10312 #endif
10313 #ifdef HAVE_FDATASYNC
10314                 case SYNC_METHOD_FDATASYNC:
10315                         if (pg_fdatasync(fd) != 0)
10316                                 ereport(PANIC,
10317                                                 (errcode_for_file_access(),
10318                                                  errmsg("could not fdatasync file \"%s\": %m",
10319                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10320                         break;
10321 #endif
10322                 case SYNC_METHOD_OPEN:
10323                 case SYNC_METHOD_OPEN_DSYNC:
10324                         /* write synced it already */
10325                         break;
10326                 default:
10327                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10328                         break;
10329         }
10330         pgstat_report_wait_end();
10331 }
10332
10333 /*
10334  * Return the filename of given log segment, as a palloc'd string.
10335  */
10336 char *
10337 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10338 {
10339         char       *result = palloc(MAXFNAMELEN);
10340
10341         XLogFileName(result, tli, segno, wal_segment_size);
10342         return result;
10343 }
10344
10345 /*
10346  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
10347  * function. It creates the necessary starting checkpoint and constructs the
10348  * backup label file.
10349  *
 * There are two kinds of backups: exclusive and non-exclusive. An exclusive
10351  * backup is started with pg_start_backup(), and there can be only one active
10352  * at a time. The backup and tablespace map files of an exclusive backup are
10353  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10354  * removed by pg_stop_backup().
10355  *
10356  * A non-exclusive backup is used for the streaming base backups (see
10357  * src/backend/replication/basebackup.c). The difference to exclusive backups
10358  * is that the backup label and tablespace map files are not written to disk.
10359  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10360  * and the caller is responsible for including them in the backup archive as
10361  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10362  * active at the same time, and they don't conflict with an exclusive backup
10363  * either.
10364  *
10365  * tblspcmapfile is required mainly for tar format in windows as native windows
10366  * utilities are not able to create symlinks while extracting files from tar.
10367  * However for consistency, the same is used for all platforms.
10368  *
10369  * needtblspcmapfile is true for the cases (exclusive backup and for
10370  * non-exclusive backup only when tar format is used for taking backup)
10371  * when backup needs to generate tablespace_map file, it is used to
10372  * embed escape character before newline character in tablespace path.
10373  *
10374  * Returns the minimum WAL location that must be present to restore from this
10375  * backup, and the corresponding timeline ID in *starttli_p.
10376  *
10377  * Every successfully started non-exclusive backup must be stopped by calling
10378  * do_pg_stop_backup() or do_pg_abort_backup().
10379  *
10380  * It is the responsibility of the caller of this function to verify the
10381  * permissions of the calling user!
10382  */
10383 XLogRecPtr
10384 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10385                                    StringInfo labelfile, List **tablespaces,
10386                                    StringInfo tblspcmapfile, bool infotbssize,
10387                                    bool needtblspcmapfile)
10388 {
10389         bool            exclusive = (labelfile == NULL);
10390         bool            backup_started_in_recovery = false;
10391         XLogRecPtr      checkpointloc;
10392         XLogRecPtr      startpoint;
10393         TimeLineID      starttli;
10394         pg_time_t       stamp_time;
10395         char            strfbuf[128];
10396         char            xlogfilename[MAXFNAMELEN];
10397         XLogSegNo       _logSegNo;
10398         struct stat stat_buf;
10399         FILE       *fp;
10400
10401         backup_started_in_recovery = RecoveryInProgress();
10402
10403         /*
10404          * Currently only non-exclusive backup can be taken during recovery.
10405          */
10406         if (backup_started_in_recovery && exclusive)
10407                 ereport(ERROR,
10408                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10409                                  errmsg("recovery is in progress"),
10410                                  errhint("WAL control functions cannot be executed during recovery.")));
10411
10412         /*
10413          * During recovery, we don't need to check WAL level. Because, if WAL
10414          * level is not sufficient, it's impossible to get here during recovery.
10415          */
10416         if (!backup_started_in_recovery && !XLogIsNeeded())
10417                 ereport(ERROR,
10418                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10419                                  errmsg("WAL level not sufficient for making an online backup"),
10420                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10421
10422         if (strlen(backupidstr) > MAXPGPATH)
10423                 ereport(ERROR,
10424                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10425                                  errmsg("backup label too long (max %d bytes)",
10426                                                 MAXPGPATH)));
10427
10428         /*
10429          * Mark backup active in shared memory.  We must do full-page WAL writes
10430          * during an on-line backup even if not doing so at other times, because
10431          * it's quite possible for the backup dump to obtain a "torn" (partially
10432          * written) copy of a database page if it reads the page concurrently with
10433          * our write to the same page.  This can be fixed as long as the first
10434          * write to the page in the WAL sequence is a full-page write. Hence, we
10435          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10436          * are no dirty pages in shared memory that might get dumped while the
10437          * backup is in progress without having a corresponding WAL record.  (Once
10438          * the backup is complete, we need not force full-page writes anymore,
10439          * since we expect that any pages not modified during the backup interval
10440          * must have been correctly captured by the backup.)
10441          *
10442          * Note that forcePageWrites has no effect during an online backup from
10443          * the standby.
10444          *
10445          * We must hold all the insertion locks to change the value of
10446          * forcePageWrites, to ensure adequate interlocking against
10447          * XLogInsertRecord().
10448          */
10449         WALInsertLockAcquireExclusive();
10450         if (exclusive)
10451         {
10452                 /*
10453                  * At first, mark that we're now starting an exclusive backup, to
10454                  * ensure that there are no other sessions currently running
10455                  * pg_start_backup() or pg_stop_backup().
10456                  */
10457                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10458                 {
10459                         WALInsertLockRelease();
10460                         ereport(ERROR,
10461                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10462                                          errmsg("a backup is already in progress"),
10463                                          errhint("Run pg_stop_backup() and try again.")));
10464                 }
10465                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10466         }
10467         else
10468                 XLogCtl->Insert.nonExclusiveBackups++;
10469         XLogCtl->Insert.forcePageWrites = true;
10470         WALInsertLockRelease();
10471
10472         /* Ensure we release forcePageWrites if fail below */
10473         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10474         {
10475                 bool            gotUniqueStartpoint = false;
10476                 DIR                *tblspcdir;
10477                 struct dirent *de;
10478                 tablespaceinfo *ti;
10479                 int                     datadirpathlen;
10480
10481                 /*
10482                  * Force an XLOG file switch before the checkpoint, to ensure that the
10483                  * WAL segment the checkpoint is written to doesn't contain pages with
10484                  * old timeline IDs.  That would otherwise happen if you called
10485                  * pg_start_backup() right after restoring from a PITR archive: the
10486                  * first WAL segment containing the startup checkpoint has pages in
10487                  * the beginning with the old timeline ID.  That can cause trouble at
10488                  * recovery: we won't have a history file covering the old timeline if
10489                  * pg_wal directory was not included in the base backup and the WAL
10490                  * archive was cleared too before starting the backup.
10491                  *
10492                  * This also ensures that we have emitted a WAL page header that has
10493                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10494                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
10495                  * compress out removable backup blocks, it won't remove any that
10496                  * occur after this point.
10497                  *
10498                  * During recovery, we skip forcing XLOG file switch, which means that
10499                  * the backup taken during recovery is not available for the special
10500                  * recovery case described above.
10501                  */
10502                 if (!backup_started_in_recovery)
10503                         RequestXLogSwitch(false);
10504
10505                 do
10506                 {
10507                         bool            checkpointfpw;
10508
10509                         /*
10510                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
10511                          * page problems, this guarantees that two successive backup runs
10512                          * will have different checkpoint positions and hence different
10513                          * history file names, even if nothing happened in between.
10514                          *
10515                          * During recovery, establish a restartpoint if possible. We use
10516                          * the last restartpoint as the backup starting checkpoint. This
10517                          * means that two successive backup runs can have same checkpoint
10518                          * positions.
10519                          *
10520                          * Since the fact that we are executing do_pg_start_backup()
10521                          * during recovery means that checkpointer is running, we can use
10522                          * RequestCheckpoint() to establish a restartpoint.
10523                          *
10524                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10525                          * passing fast = true).  Otherwise this can take awhile.
10526                          */
10527                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10528                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
10529
10530                         /*
10531                          * Now we need to fetch the checkpoint record location, and also
10532                          * its REDO pointer.  The oldest point in WAL that would be needed
10533                          * to restore starting from the checkpoint is precisely the REDO
10534                          * pointer.
10535                          */
10536                         LWLockAcquire(ControlFileLock, LW_SHARED);
10537                         checkpointloc = ControlFile->checkPoint;
10538                         startpoint = ControlFile->checkPointCopy.redo;
10539                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10540                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10541                         LWLockRelease(ControlFileLock);
10542
10543                         if (backup_started_in_recovery)
10544                         {
10545                                 XLogRecPtr      recptr;
10546
10547                                 /*
10548                                  * Check to see if all WAL replayed during online backup
10549                                  * (i.e., since last restartpoint used as backup starting
10550                                  * checkpoint) contain full-page writes.
10551                                  */
10552                                 SpinLockAcquire(&XLogCtl->info_lck);
10553                                 recptr = XLogCtl->lastFpwDisableRecPtr;
10554                                 SpinLockRelease(&XLogCtl->info_lck);
10555
10556                                 if (!checkpointfpw || startpoint <= recptr)
10557                                         ereport(ERROR,
10558                                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10559                                                          errmsg("WAL generated with full_page_writes=off was replayed "
10560                                                                         "since last restartpoint"),
10561                                                          errhint("This means that the backup being taken on the standby "
10562                                                                          "is corrupt and should not be used. "
10563                                                                          "Enable full_page_writes and run CHECKPOINT on the master, "
10564                                                                          "and then try an online backup again.")));
10565
10566                                 /*
10567                                  * During recovery, since we don't use the end-of-backup WAL
10568                                  * record and don't write the backup history file, the
10569                                  * starting WAL location doesn't need to be unique. This means
10570                                  * that two base backups started at the same time might use
10571                                  * the same checkpoint as starting locations.
10572                                  */
10573                                 gotUniqueStartpoint = true;
10574                         }
10575
10576                         /*
10577                          * If two base backups are started at the same time (in WAL sender
10578                          * processes), we need to make sure that they use different
10579                          * checkpoints as starting locations, because we use the starting
10580                          * WAL location as a unique identifier for the base backup in the
10581                          * end-of-backup WAL record and when we write the backup history
10582                          * file. Perhaps it would be better generate a separate unique ID
10583                          * for each backup instead of forcing another checkpoint, but
10584                          * taking a checkpoint right after another is not that expensive
10585                          * either because only few buffers have been dirtied yet.
10586                          */
10587                         WALInsertLockAcquireExclusive();
10588                         if (XLogCtl->Insert.lastBackupStart < startpoint)
10589                         {
10590                                 XLogCtl->Insert.lastBackupStart = startpoint;
10591                                 gotUniqueStartpoint = true;
10592                         }
10593                         WALInsertLockRelease();
10594                 } while (!gotUniqueStartpoint);
10595
10596                 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
10597                 XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
10598
10599                 /*
10600                  * Construct tablespace_map file
10601                  */
10602                 if (exclusive)
10603                         tblspcmapfile = makeStringInfo();
10604
10605                 datadirpathlen = strlen(DataDir);
10606
10607                 /* Collect information about all tablespaces */
10608                 tblspcdir = AllocateDir("pg_tblspc");
10609                 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10610                 {
10611                         char            fullpath[MAXPGPATH + 10];
10612                         char            linkpath[MAXPGPATH];
10613                         char       *relpath = NULL;
10614                         int                     rllen;
10615                         StringInfoData buflinkpath;
10616                         char       *s = linkpath;
10617
10618                         /* Skip special stuff */
10619                         if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10620                                 continue;
10621
10622                         snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10623
10624 #if defined(HAVE_READLINK) || defined(WIN32)
10625                         rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10626                         if (rllen < 0)
10627                         {
10628                                 ereport(WARNING,
10629                                                 (errmsg("could not read symbolic link \"%s\": %m",
10630                                                                 fullpath)));
10631                                 continue;
10632                         }
10633                         else if (rllen >= sizeof(linkpath))
10634                         {
10635                                 ereport(WARNING,
10636                                                 (errmsg("symbolic link \"%s\" target is too long",
10637                                                                 fullpath)));
10638                                 continue;
10639                         }
10640                         linkpath[rllen] = '\0';
10641
10642                         /*
10643                          * Add the escape character '\\' before newline in a string to
10644                          * ensure that we can distinguish between the newline in the
10645                          * tablespace path and end of line while reading tablespace_map
10646                          * file during archive recovery.
10647                          */
10648                         initStringInfo(&buflinkpath);
10649
10650                         while (*s)
10651                         {
10652                                 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10653                                         appendStringInfoChar(&buflinkpath, '\\');
10654                                 appendStringInfoChar(&buflinkpath, *s++);
10655                         }
10656
10657                         /*
10658                          * Relpath holds the relative path of the tablespace directory
10659                          * when it's located within PGDATA, or NULL if it's located
10660                          * elsewhere.
10661                          */
10662                         if (rllen > datadirpathlen &&
10663                                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10664                                 IS_DIR_SEP(linkpath[datadirpathlen]))
10665                                 relpath = linkpath + datadirpathlen + 1;
10666
10667                         ti = palloc(sizeof(tablespaceinfo));
10668                         ti->oid = pstrdup(de->d_name);
10669                         ti->path = pstrdup(buflinkpath.data);
10670                         ti->rpath = relpath ? pstrdup(relpath) : NULL;
10671                         ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10672
10673                         if (tablespaces)
10674                                 *tablespaces = lappend(*tablespaces, ti);
10675
10676                         appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10677
10678                         pfree(buflinkpath.data);
10679 #else
10680
10681                         /*
10682                          * If the platform does not have symbolic links, it should not be
10683                          * possible to have tablespaces - clearly somebody else created
10684                          * them. Warn about it and ignore.
10685                          */
10686                         ereport(WARNING,
10687                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10688                                          errmsg("tablespaces are not supported on this platform")));
10689 #endif
10690                 }
10691                 FreeDir(tblspcdir);
10692
10693                 /*
10694                  * Construct backup label file
10695                  */
10696                 if (exclusive)
10697                         labelfile = makeStringInfo();
10698
10699                 /* Use the log timezone here, not the session timezone */
10700                 stamp_time = (pg_time_t) time(NULL);
10701                 pg_strftime(strfbuf, sizeof(strfbuf),
10702                                         "%Y-%m-%d %H:%M:%S %Z",
10703                                         pg_localtime(&stamp_time, log_timezone));
10704                 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10705                                                  (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10706                 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10707                                                  (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10708                 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10709                                                  exclusive ? "pg_start_backup" : "streamed");
10710                 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10711                                                  backup_started_in_recovery ? "standby" : "master");
10712                 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10713                 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10714                 appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
10715
10716                 /*
10717                  * Okay, write the file, or return its contents to caller.
10718                  */
10719                 if (exclusive)
10720                 {
10721                         /*
10722                          * Check for existing backup label --- implies a backup is already
10723                          * running.  (XXX given that we checked exclusiveBackupState
10724                          * above, maybe it would be OK to just unlink any such label
10725                          * file?)
10726                          */
10727                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10728                         {
10729                                 if (errno != ENOENT)
10730                                         ereport(ERROR,
10731                                                         (errcode_for_file_access(),
10732                                                          errmsg("could not stat file \"%s\": %m",
10733                                                                         BACKUP_LABEL_FILE)));
10734                         }
10735                         else
10736                                 ereport(ERROR,
10737                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10738                                                  errmsg("a backup is already in progress"),
10739                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10740                                                                  BACKUP_LABEL_FILE)));
10741
10742                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10743
10744                         if (!fp)
10745                                 ereport(ERROR,
10746                                                 (errcode_for_file_access(),
10747                                                  errmsg("could not create file \"%s\": %m",
10748                                                                 BACKUP_LABEL_FILE)));
10749                         if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10750                                 fflush(fp) != 0 ||
10751                                 pg_fsync(fileno(fp)) != 0 ||
10752                                 ferror(fp) ||
10753                                 FreeFile(fp))
10754                                 ereport(ERROR,
10755                                                 (errcode_for_file_access(),
10756                                                  errmsg("could not write file \"%s\": %m",
10757                                                                 BACKUP_LABEL_FILE)));
10758                         /* Allocated locally for exclusive backups, so free separately */
10759                         pfree(labelfile->data);
10760                         pfree(labelfile);
10761
10762                         /* Write backup tablespace_map file. */
10763                         if (tblspcmapfile->len > 0)
10764                         {
10765                                 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10766                                 {
10767                                         if (errno != ENOENT)
10768                                                 ereport(ERROR,
10769                                                                 (errcode_for_file_access(),
10770                                                                  errmsg("could not stat file \"%s\": %m",
10771                                                                                 TABLESPACE_MAP)));
10772                                 }
10773                                 else
10774                                         ereport(ERROR,
10775                                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10776                                                          errmsg("a backup is already in progress"),
10777                                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10778                                                                          TABLESPACE_MAP)));
10779
10780                                 fp = AllocateFile(TABLESPACE_MAP, "w");
10781
10782                                 if (!fp)
10783                                         ereport(ERROR,
10784                                                         (errcode_for_file_access(),
10785                                                          errmsg("could not create file \"%s\": %m",
10786                                                                         TABLESPACE_MAP)));
10787                                 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10788                                         fflush(fp) != 0 ||
10789                                         pg_fsync(fileno(fp)) != 0 ||
10790                                         ferror(fp) ||
10791                                         FreeFile(fp))
10792                                         ereport(ERROR,
10793                                                         (errcode_for_file_access(),
10794                                                          errmsg("could not write file \"%s\": %m",
10795                                                                         TABLESPACE_MAP)));
10796                         }
10797
10798                         /* Allocated locally for exclusive backups, so free separately */
10799                         pfree(tblspcmapfile->data);
10800                         pfree(tblspcmapfile);
10801                 }
10802         }
10803         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10804
10805         /*
10806          * Mark that start phase has correctly finished for an exclusive backup.
10807          * Session-level locks are updated as well to reflect that state.
10808          *
10809          * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
10810          * counters and session-level lock. Otherwise they can be updated
10811          * inconsistently, which might cause do_pg_abort_backup() to fail.
10812          */
10813         if (exclusive)
10814         {
10815                 WALInsertLockAcquireExclusive();
10816                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10817
10818                 /* Set session-level lock */
10819                 sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
10820                 WALInsertLockRelease();
10821         }
10822         else
10823                 sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
10824
10825         /*
10826          * We're done.  As a convenience, return the starting WAL location.
10827          */
10828         if (starttli_p)
10829                 *starttli_p = starttli;
10830         return startpoint;
10831 }
10832
10833 /* Error cleanup callback for pg_start_backup */
10834 static void
10835 pg_start_backup_callback(int code, Datum arg)
10836 {
10837         bool            exclusive = DatumGetBool(arg);
10838
10839         /* Update backup counters and forcePageWrites on failure */
10840         WALInsertLockAcquireExclusive();
10841         if (exclusive)
10842         {
10843                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10844                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10845         }
10846         else
10847         {
10848                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10849                 XLogCtl->Insert.nonExclusiveBackups--;
10850         }
10851
10852         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10853                 XLogCtl->Insert.nonExclusiveBackups == 0)
10854         {
10855                 XLogCtl->Insert.forcePageWrites = false;
10856         }
10857         WALInsertLockRelease();
10858 }
10859
10860 /*
10861  * Error cleanup callback for pg_stop_backup
10862  */
10863 static void
10864 pg_stop_backup_callback(int code, Datum arg)
10865 {
10866         bool            exclusive = DatumGetBool(arg);
10867
10868         /* Update backup status on failure */
10869         WALInsertLockAcquireExclusive();
10870         if (exclusive)
10871         {
10872                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10873                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10874         }
10875         WALInsertLockRelease();
10876 }
10877
10878 /*
10879  * Utility routine to fetch the session-level status of a backup running.
10880  */
10881 SessionBackupState
10882 get_backup_status(void)
10883 {
10884         return sessionBackupState;
10885 }
10886
10887 /*
10888  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10889  * function.
10890  *
10891  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10892  * the non-exclusive backup specified by 'labelfile'.
10893  *
10894  * Returns the last WAL location that must be present to restore from this
10895  * backup, and the corresponding timeline ID in *stoptli_p.
10896  *
10897  * It is the responsibility of the caller of this function to verify the
10898  * permissions of the calling user!
10899  */
10900 XLogRecPtr
10901 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10902 {
10903         bool            exclusive = (labelfile == NULL);
10904         bool            backup_started_in_recovery = false;
10905         XLogRecPtr      startpoint;
10906         XLogRecPtr      stoppoint;
10907         TimeLineID      stoptli;
10908         pg_time_t       stamp_time;
10909         char            strfbuf[128];
10910         char            histfilepath[MAXPGPATH];
10911         char            startxlogfilename[MAXFNAMELEN];
10912         char            stopxlogfilename[MAXFNAMELEN];
10913         char            lastxlogfilename[MAXFNAMELEN];
10914         char            histfilename[MAXFNAMELEN];
10915         char            backupfrom[20];
10916         XLogSegNo       _logSegNo;
10917         FILE       *lfp;
10918         FILE       *fp;
10919         char            ch;
10920         int                     seconds_before_warning;
10921         int                     waits = 0;
10922         bool            reported_waiting = false;
10923         char       *remaining;
10924         char       *ptr;
10925         uint32          hi,
10926                                 lo;
10927
10928         backup_started_in_recovery = RecoveryInProgress();
10929
10930         /*
10931          * Currently only non-exclusive backup can be taken during recovery.
10932          */
10933         if (backup_started_in_recovery && exclusive)
10934                 ereport(ERROR,
10935                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10936                                  errmsg("recovery is in progress"),
10937                                  errhint("WAL control functions cannot be executed during recovery.")));
10938
10939         /*
10940          * During recovery, we don't need to check WAL level. Because, if WAL
10941          * level is not sufficient, it's impossible to get here during recovery.
10942          */
10943         if (!backup_started_in_recovery && !XLogIsNeeded())
10944                 ereport(ERROR,
10945                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10946                                  errmsg("WAL level not sufficient for making an online backup"),
10947                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10948
10949         if (exclusive)
10950         {
10951                 /*
10952                  * At first, mark that we're now stopping an exclusive backup, to
10953                  * ensure that there are no other sessions currently running
10954                  * pg_start_backup() or pg_stop_backup().
10955                  */
10956                 WALInsertLockAcquireExclusive();
10957                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10958                 {
10959                         WALInsertLockRelease();
10960                         ereport(ERROR,
10961                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10962                                          errmsg("exclusive backup not in progress")));
10963                 }
10964                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10965                 WALInsertLockRelease();
10966
10967                 /*
10968                  * Remove backup_label. In case of failure, the state for an exclusive
10969                  * backup is switched back to in-progress.
10970                  */
10971                 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10972                 {
10973                         /*
10974                          * Read the existing label file into memory.
10975                          */
10976                         struct stat statbuf;
10977                         int                     r;
10978
10979                         if (stat(BACKUP_LABEL_FILE, &statbuf))
10980                         {
10981                                 /* should not happen per the upper checks */
10982                                 if (errno != ENOENT)
10983                                         ereport(ERROR,
10984                                                         (errcode_for_file_access(),
10985                                                          errmsg("could not stat file \"%s\": %m",
10986                                                                         BACKUP_LABEL_FILE)));
10987                                 ereport(ERROR,
10988                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10989                                                  errmsg("a backup is not in progress")));
10990                         }
10991
10992                         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10993                         if (!lfp)
10994                         {
10995                                 ereport(ERROR,
10996                                                 (errcode_for_file_access(),
10997                                                  errmsg("could not read file \"%s\": %m",
10998                                                                 BACKUP_LABEL_FILE)));
10999                         }
11000                         labelfile = palloc(statbuf.st_size + 1);
11001                         r = fread(labelfile, statbuf.st_size, 1, lfp);
11002                         labelfile[statbuf.st_size] = '\0';
11003
11004                         /*
11005                          * Close and remove the backup label file
11006                          */
11007                         if (r != 1 || ferror(lfp) || FreeFile(lfp))
11008                                 ereport(ERROR,
11009                                                 (errcode_for_file_access(),
11010                                                  errmsg("could not read file \"%s\": %m",
11011                                                                 BACKUP_LABEL_FILE)));
11012                         durable_unlink(BACKUP_LABEL_FILE, ERROR);
11013
11014                         /*
11015                          * Remove tablespace_map file if present, it is created only if
11016                          * there are tablespaces.
11017                          */
11018                         durable_unlink(TABLESPACE_MAP, DEBUG1);
11019                 }
11020                 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11021         }
11022
11023         /*
11024          * OK to update backup counters, forcePageWrites and session-level lock.
11025          *
11026          * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
11027          * Otherwise they can be updated inconsistently, which might cause
11028          * do_pg_abort_backup() to fail.
11029          */
11030         WALInsertLockAcquireExclusive();
11031         if (exclusive)
11032         {
11033                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11034         }
11035         else
11036         {
11037                 /*
11038                  * The user-visible pg_start/stop_backup() functions that operate on
11039                  * exclusive backups can be called at any time, but for non-exclusive
11040                  * backups, it is expected that each do_pg_start_backup() call is
11041                  * matched by exactly one do_pg_stop_backup() call.
11042                  */
11043                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11044                 XLogCtl->Insert.nonExclusiveBackups--;
11045         }
11046
11047         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11048                 XLogCtl->Insert.nonExclusiveBackups == 0)
11049         {
11050                 XLogCtl->Insert.forcePageWrites = false;
11051         }
11052
11053         /*
11054          * Clean up session-level lock.
11055          *
11056          * You might think that WALInsertLockRelease() can be called before
11057          * cleaning up session-level lock because session-level lock doesn't need
11058          * to be protected with WAL insertion lock. But since
11059          * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
11060          * cleaned up before it.
11061          */
11062         sessionBackupState = SESSION_BACKUP_NONE;
11063
11064         WALInsertLockRelease();
11065
11066         /*
11067          * Read and parse the START WAL LOCATION line (this code is pretty crude,
11068          * but we are not expecting any variability in the file format).
11069          */
11070         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
11071                            &hi, &lo, startxlogfilename,
11072                            &ch) != 4 || ch != '\n')
11073                 ereport(ERROR,
11074                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11075                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11076         startpoint = ((uint64) hi) << 32 | lo;
11077         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
11078
11079         /*
11080          * Parse the BACKUP FROM line. If we are taking an online backup from the
11081          * standby, we confirm that the standby has not been promoted during the
11082          * backup.
11083          */
11084         ptr = strstr(remaining, "BACKUP FROM:");
11085         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
11086                 ereport(ERROR,
11087                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11088                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11089         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
11090                 ereport(ERROR,
11091                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11092                                  errmsg("the standby was promoted during online backup"),
11093                                  errhint("This means that the backup being taken is corrupt "
11094                                                  "and should not be used. "
11095                                                  "Try taking another online backup.")));
11096
11097         /*
11098          * During recovery, we don't write an end-of-backup record. We assume that
11099          * pg_control was backed up last and its minimum recovery point can be
11100          * available as the backup end location. Since we don't have an
11101          * end-of-backup record, we use the pg_control value to check whether
11102          * we've reached the end of backup when starting recovery from this
11103          * backup. We have no way of checking if pg_control wasn't backed up last
11104          * however.
11105          *
11106          * We don't force a switch to new WAL file but it is still possible to
11107          * wait for all the required files to be archived if waitforarchive is
11108          * true. This is okay if we use the backup to start a standby and fetch
11109          * the missing WAL using streaming replication. But in the case of an
11110          * archive recovery, a user should set waitforarchive to true and wait for
11111          * them to be archived to ensure that all the required files are
11112          * available.
11113          *
11114          * We return the current minimum recovery point as the backup end
11115          * location. Note that it can be greater than the exact backup end
11116          * location if the minimum recovery point is updated after the backup of
11117          * pg_control. This is harmless for current uses.
11118          *
11119          * XXX currently a backup history file is for informational and debug
11120          * purposes only. It's not essential for an online backup. Furthermore,
11121          * even if it's created, it will not be archived during recovery because
11122          * an archiver is not invoked. So it doesn't seem worthwhile to write a
11123          * backup history file during recovery.
11124          */
11125         if (backup_started_in_recovery)
11126         {
11127                 XLogRecPtr      recptr;
11128
11129                 /*
11130                  * Check to see if all WAL replayed during online backup contain
11131                  * full-page writes.
11132                  */
11133                 SpinLockAcquire(&XLogCtl->info_lck);
11134                 recptr = XLogCtl->lastFpwDisableRecPtr;
11135                 SpinLockRelease(&XLogCtl->info_lck);
11136
11137                 if (startpoint <= recptr)
11138                         ereport(ERROR,
11139                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11140                                          errmsg("WAL generated with full_page_writes=off was replayed "
11141                                                         "during online backup"),
11142                                          errhint("This means that the backup being taken on the standby "
11143                                                          "is corrupt and should not be used. "
11144                                                          "Enable full_page_writes and run CHECKPOINT on the master, "
11145                                                          "and then try an online backup again.")));
11146
11147
11148                 LWLockAcquire(ControlFileLock, LW_SHARED);
11149                 stoppoint = ControlFile->minRecoveryPoint;
11150                 stoptli = ControlFile->minRecoveryPointTLI;
11151                 LWLockRelease(ControlFileLock);
11152         }
11153         else
11154         {
11155                 /*
11156                  * Write the backup-end xlog record
11157                  */
11158                 XLogBeginInsert();
11159                 XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
11160                 stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
11161                 stoptli = ThisTimeLineID;
11162
11163                 /*
11164                  * Force a switch to a new xlog segment file, so that the backup is
11165                  * valid as soon as archiver moves out the current segment file.
11166                  */
11167                 RequestXLogSwitch(false);
11168
11169                 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11170                 XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
11171
11172                 /* Use the log timezone here, not the session timezone */
11173                 stamp_time = (pg_time_t) time(NULL);
11174                 pg_strftime(strfbuf, sizeof(strfbuf),
11175                                         "%Y-%m-%d %H:%M:%S %Z",
11176                                         pg_localtime(&stamp_time, log_timezone));
11177
11178                 /*
11179                  * Write the backup history file
11180                  */
11181                 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11182                 BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
11183                                                           startpoint, wal_segment_size);
11184                 fp = AllocateFile(histfilepath, "w");
11185                 if (!fp)
11186                         ereport(ERROR,
11187                                         (errcode_for_file_access(),
11188                                          errmsg("could not create file \"%s\": %m",
11189                                                         histfilepath)));
11190                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
11191                                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
11192                 fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11193                                 (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
11194
11195                 /*
11196                  * Transfer remaining lines including label and start timeline to
11197                  * history file.
11198                  */
11199                 fprintf(fp, "%s", remaining);
11200                 fprintf(fp, "STOP TIME: %s\n", strfbuf);
11201                 fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11202                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
11203                         ereport(ERROR,
11204                                         (errcode_for_file_access(),
11205                                          errmsg("could not write file \"%s\": %m",
11206                                                         histfilepath)));
11207
11208                 /*
11209                  * Clean out any no-longer-needed history files.  As a side effect,
11210                  * this will post a .ready file for the newly created history file,
11211                  * notifying the archiver that history file may be archived
11212                  * immediately.
11213                  */
11214                 CleanupBackupHistory();
11215         }
11216
11217         /*
11218          * If archiving is enabled, wait for all the required WAL files to be
11219          * archived before returning. If archiving isn't enabled, the required WAL
11220          * needs to be transported via streaming replication (hopefully with
11221          * wal_keep_segments set high enough), or some more exotic mechanism like
11222          * polling and copying files from pg_wal with script. We have no knowledge
11223          * of those mechanisms, so it's up to the user to ensure that he gets all
11224          * the required WAL.
11225          *
11226          * We wait until both the last WAL file filled during backup and the
11227          * history file have been archived, and assume that the alphabetic sorting
11228          * property of the WAL files ensures any earlier WAL files are safely
11229          * archived as well.
11230          *
11231          * We wait forever, since archive_command is supposed to work and we
11232          * assume the admin wanted his backup to work completely. If you don't
11233          * wish to wait, then either waitforarchive should be passed in as false,
11234          * or you can set statement_timeout.  Also, some notices are issued to
11235          * clue in anyone who might be doing this interactively.
11236          */
11237
11238         if (waitforarchive &&
11239                 ((!backup_started_in_recovery && XLogArchivingActive()) ||
11240                  (backup_started_in_recovery && XLogArchivingAlways())))
11241         {
11242                 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11243                 XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11244
11245                 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11246                 BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11247                                                           startpoint, wal_segment_size);
11248
11249                 seconds_before_warning = 60;
11250                 waits = 0;
11251
11252                 while (XLogArchiveIsBusy(lastxlogfilename) ||
11253                            XLogArchiveIsBusy(histfilename))
11254                 {
11255                         CHECK_FOR_INTERRUPTS();
11256
11257                         if (!reported_waiting && waits > 5)
11258                         {
11259                                 ereport(NOTICE,
11260                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
11261                                 reported_waiting = true;
11262                         }
11263
11264                         pg_usleep(1000000L);
11265
11266                         if (++waits >= seconds_before_warning)
11267                         {
11268                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
11269                                 ereport(WARNING,
11270                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11271                                                                 waits),
11272                                                  errhint("Check that your archive_command is executing properly.  "
11273                                                                  "pg_stop_backup can be canceled safely, "
11274                                                                  "but the database backup will not be usable without all the WAL segments.")));
11275                         }
11276                 }
11277
11278                 ereport(NOTICE,
11279                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
11280         }
11281         else if (waitforarchive)
11282                 ereport(NOTICE,
11283                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11284
11285         /*
11286          * We're done.  As a convenience, return the ending WAL location.
11287          */
11288         if (stoptli_p)
11289                 *stoptli_p = stoptli;
11290         return stoppoint;
11291 }
11292
11293
11294 /*
11295  * do_pg_abort_backup: abort a running backup
11296  *
11297  * This does just the most basic steps of do_pg_stop_backup(), by taking the
11298  * system out of backup mode, thus making it a lot more safe to call from
11299  * an error handler.
11300  *
11301  * NB: This is only for aborting a non-exclusive backup that doesn't write
11302  * backup_label. A backup started with pg_start_backup() needs to be finished
11303  * with pg_stop_backup().
11304  */
11305 void
11306 do_pg_abort_backup(void)
11307 {
11308         /*
11309          * Quick exit if session is not keeping around a non-exclusive backup
11310          * already started.
11311          */
11312         if (sessionBackupState == SESSION_BACKUP_NONE)
11313                 return;
11314
11315         WALInsertLockAcquireExclusive();
11316         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11317         Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
11318         XLogCtl->Insert.nonExclusiveBackups--;
11319
11320         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11321                 XLogCtl->Insert.nonExclusiveBackups == 0)
11322         {
11323                 XLogCtl->Insert.forcePageWrites = false;
11324         }
11325         WALInsertLockRelease();
11326 }
11327
11328 /*
11329  * Get latest redo apply position.
11330  *
11331  * Exported to allow WALReceiver to read the pointer directly.
11332  */
11333 XLogRecPtr
11334 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11335 {
11336         XLogRecPtr      recptr;
11337         TimeLineID      tli;
11338
11339         SpinLockAcquire(&XLogCtl->info_lck);
11340         recptr = XLogCtl->lastReplayedEndRecPtr;
11341         tli = XLogCtl->lastReplayedTLI;
11342         SpinLockRelease(&XLogCtl->info_lck);
11343
11344         if (replayTLI)
11345                 *replayTLI = tli;
11346         return recptr;
11347 }
11348
11349 /*
11350  * Get latest WAL insert pointer
11351  */
11352 XLogRecPtr
11353 GetXLogInsertRecPtr(void)
11354 {
11355         XLogCtlInsert *Insert = &XLogCtl->Insert;
11356         uint64          current_bytepos;
11357
11358         SpinLockAcquire(&Insert->insertpos_lck);
11359         current_bytepos = Insert->CurrBytePos;
11360         SpinLockRelease(&Insert->insertpos_lck);
11361
11362         return XLogBytePosToRecPtr(current_bytepos);
11363 }
11364
11365 /*
11366  * Get latest WAL write pointer
11367  */
11368 XLogRecPtr
11369 GetXLogWriteRecPtr(void)
11370 {
11371         SpinLockAcquire(&XLogCtl->info_lck);
11372         LogwrtResult = XLogCtl->LogwrtResult;
11373         SpinLockRelease(&XLogCtl->info_lck);
11374
11375         return LogwrtResult.Write;
11376 }
11377
11378 /*
11379  * Returns the redo pointer of the last checkpoint or restartpoint. This is
11380  * the oldest point in WAL that we still need, if we have to restart recovery.
11381  */
11382 void
11383 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11384 {
11385         LWLockAcquire(ControlFileLock, LW_SHARED);
11386         *oldrecptr = ControlFile->checkPointCopy.redo;
11387         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11388         LWLockRelease(ControlFileLock);
11389 }
11390
11391 /*
11392  * read_backup_label: check to see if a backup_label file is present
11393  *
11394  * If we see a backup_label during recovery, we assume that we are recovering
11395  * from a backup dump file, and we therefore roll forward from the checkpoint
11396  * identified by the label file, NOT what pg_control says.  This avoids the
11397  * problem that pg_control might have been archived one or more checkpoints
11398  * later than the start of the dump, and so if we rely on it as the start
11399  * point, we will fail to restore a consistent database state.
11400  *
11401  * Returns true if a backup_label was found (and fills the checkpoint
11402  * location and its REDO location into *checkPointLoc and RedoStartLSN,
11403  * respectively); returns false if not. If this backup_label came from a
11404  * streamed backup, *backupEndRequired is set to true. If this backup_label
11405  * was created during recovery, *backupFromStandby is set to true.
11406  */
11407 static bool
11408 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
11409                                   bool *backupFromStandby)
11410 {
11411         char            startxlogfilename[MAXFNAMELEN];
11412         TimeLineID      tli_from_walseg,
11413                                 tli_from_file;
11414         FILE       *lfp;
11415         char            ch;
11416         char            backuptype[20];
11417         char            backupfrom[20];
11418         char            backuplabel[MAXPGPATH];
11419         char            backuptime[128];
11420         uint32          hi,
11421                                 lo;
11422
11423         *backupEndRequired = false;
11424         *backupFromStandby = false;
11425
11426         /*
11427          * See if label file is present
11428          */
11429         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11430         if (!lfp)
11431         {
11432                 if (errno != ENOENT)
11433                         ereport(FATAL,
11434                                         (errcode_for_file_access(),
11435                                          errmsg("could not read file \"%s\": %m",
11436                                                         BACKUP_LABEL_FILE)));
11437                 return false;                   /* it's not there, all is fine */
11438         }
11439
11440         /*
11441          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11442          * is pretty crude, but we are not expecting any variability in the file
11443          * format).
11444          */
11445         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11446                            &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
11447                 ereport(FATAL,
11448                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11449                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11450         RedoStartLSN = ((uint64) hi) << 32 | lo;
11451         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11452                            &hi, &lo, &ch) != 3 || ch != '\n')
11453                 ereport(FATAL,
11454                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11455                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11456         *checkPointLoc = ((uint64) hi) << 32 | lo;
11457
11458         /*
11459          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11460          * from an older backup anyway, but since the information on it is not
11461          * strictly required, don't error out if it's missing for some reason.
11462          */
11463         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
11464         {
11465                 if (strcmp(backuptype, "streamed") == 0)
11466                         *backupEndRequired = true;
11467         }
11468
11469         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
11470         {
11471                 if (strcmp(backupfrom, "standby") == 0)
11472                         *backupFromStandby = true;
11473         }
11474
11475         /*
11476          * Parse START TIME and LABEL. Those are not mandatory fields for recovery
11477          * but checking for their presence is useful for debugging and the next
11478          * sanity checks. Cope also with the fact that the result buffers have a
11479          * pre-allocated size, hence if the backup_label file has been generated
11480          * with strings longer than the maximum assumed here an incorrect parsing
11481          * happens. That's fine as only minor consistency checks are done
11482          * afterwards.
11483          */
11484         if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
11485                 ereport(DEBUG1,
11486                                 (errmsg("backup time %s in file \"%s\"",
11487                                                 backuptime, BACKUP_LABEL_FILE)));
11488
11489         if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
11490                 ereport(DEBUG1,
11491                                 (errmsg("backup label %s in file \"%s\"",
11492                                                 backuplabel, BACKUP_LABEL_FILE)));
11493
11494         /*
11495          * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
11496          * it as a sanity check if present.
11497          */
11498         if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
11499         {
11500                 if (tli_from_walseg != tli_from_file)
11501                         ereport(FATAL,
11502                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11503                                          errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
11504                                          errdetail("Timeline ID parsed is %u, but expected %u",
11505                                                            tli_from_file, tli_from_walseg)));
11506
11507                 ereport(DEBUG1,
11508                                 (errmsg("backup timeline %u in file \"%s\"",
11509                                                 tli_from_file, BACKUP_LABEL_FILE)));
11510         }
11511
11512         if (ferror(lfp) || FreeFile(lfp))
11513                 ereport(FATAL,
11514                                 (errcode_for_file_access(),
11515                                  errmsg("could not read file \"%s\": %m",
11516                                                 BACKUP_LABEL_FILE)));
11517
11518         return true;
11519 }
11520
11521 /*
11522  * read_tablespace_map: check to see if a tablespace_map file is present
11523  *
11524  * If we see a tablespace_map file during recovery, we assume that we are
11525  * recovering from a backup dump file, and we therefore need to create symlinks
11526  * as per the information present in tablespace_map file.
11527  *
11528  * Returns true if a tablespace_map file was found (and fills the link
11529  * information for all the tablespace links present in file); returns false
11530  * if not.
11531  */
11532 static bool
11533 read_tablespace_map(List **tablespaces)
11534 {
11535         tablespaceinfo *ti;
11536         FILE       *lfp;
11537         char            tbsoid[MAXPGPATH];
11538         char       *tbslinkpath;
11539         char            str[MAXPGPATH];
11540         int                     ch,
11541                                 prev_ch = -1,
11542                                 i = 0,
11543                                 n;
11544
11545         /*
11546          * See if tablespace_map file is present
11547          */
11548         lfp = AllocateFile(TABLESPACE_MAP, "r");
11549         if (!lfp)
11550         {
11551                 if (errno != ENOENT)
11552                         ereport(FATAL,
11553                                         (errcode_for_file_access(),
11554                                          errmsg("could not read file \"%s\": %m",
11555                                                         TABLESPACE_MAP)));
11556                 return false;                   /* it's not there, all is fine */
11557         }
11558
11559         /*
11560          * Read and parse the link name and path lines from tablespace_map file
11561          * (this code is pretty crude, but we are not expecting any variability in
11562          * the file format).  While taking backup we embed escape character '\\'
11563          * before newline in tablespace path, so that during reading of
11564          * tablespace_map file, we could distinguish newline in tablespace path
11565          * and end of line.  Now while reading tablespace_map file, remove the
11566          * escape character that has been added in tablespace path during backup.
11567          */
11568         while ((ch = fgetc(lfp)) != EOF)
11569         {
11570                 if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11571                 {
11572                         str[i] = '\0';
11573                         if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11574                                 ereport(FATAL,
11575                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11576                                                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11577                         tbslinkpath = str + n;
11578                         i = 0;
11579
11580                         ti = palloc(sizeof(tablespaceinfo));
11581                         ti->oid = pstrdup(tbsoid);
11582                         ti->path = pstrdup(tbslinkpath);
11583
11584                         *tablespaces = lappend(*tablespaces, ti);
11585                         continue;
11586                 }
11587                 else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11588                         str[i - 1] = ch;
11589                 else
11590                         str[i++] = ch;
11591                 prev_ch = ch;
11592         }
11593
11594         if (ferror(lfp) || FreeFile(lfp))
11595                 ereport(FATAL,
11596                                 (errcode_for_file_access(),
11597                                  errmsg("could not read file \"%s\": %m",
11598                                                 TABLESPACE_MAP)));
11599
11600         return true;
11601 }
11602
11603 /*
11604  * Error context callback for errors occurring during rm_redo().
11605  */
11606 static void
11607 rm_redo_error_callback(void *arg)
11608 {
11609         XLogReaderState *record = (XLogReaderState *) arg;
11610         StringInfoData buf;
11611
11612         initStringInfo(&buf);
11613         xlog_outdesc(&buf, record);
11614
11615         /* translator: %s is a WAL record description */
11616         errcontext("WAL redo at %X/%X for %s",
11617                            (uint32) (record->ReadRecPtr >> 32),
11618                            (uint32) record->ReadRecPtr,
11619                            buf.data);
11620
11621         pfree(buf.data);
11622 }
11623
11624 /*
11625  * BackupInProgress: check if online backup mode is active
11626  *
11627  * This is done by checking for existence of the "backup_label" file.
11628  */
11629 bool
11630 BackupInProgress(void)
11631 {
11632         struct stat stat_buf;
11633
11634         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11635 }
11636
11637 /*
11638  * CancelBackup: rename the "backup_label" and "tablespace_map"
11639  *                               files to cancel backup mode
11640  *
11641  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11642  * Similarly, if the "tablespace_map" file exists, it will be renamed to
11643  * "tablespace_map.old".
11644  *
11645  * Note that this will render an online backup in progress
11646  * useless. To correctly finish an online backup, pg_stop_backup must be
11647  * called.
11648  */
11649 void
11650 CancelBackup(void)
11651 {
11652         struct stat stat_buf;
11653
11654         /* if the backup_label file is not there, return */
11655         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11656                 return;
11657
11658         /* remove leftover file from previously canceled backup if it exists */
11659         unlink(BACKUP_LABEL_OLD);
11660
11661         if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11662         {
11663                 ereport(WARNING,
11664                                 (errcode_for_file_access(),
11665                                  errmsg("online backup mode was not canceled"),
11666                                  errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11667                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11668                 return;
11669         }
11670
11671         /* if the tablespace_map file is not there, return */
11672         if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11673         {
11674                 ereport(LOG,
11675                                 (errmsg("online backup mode canceled"),
11676                                  errdetail("File \"%s\" was renamed to \"%s\".",
11677                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11678                 return;
11679         }
11680
11681         /* remove leftover file from previously canceled backup if it exists */
11682         unlink(TABLESPACE_MAP_OLD);
11683
11684         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11685         {
11686                 ereport(LOG,
11687                                 (errmsg("online backup mode canceled"),
11688                                  errdetail("Files \"%s\" and \"%s\" were renamed to "
11689                                                    "\"%s\" and \"%s\", respectively.",
11690                                                    BACKUP_LABEL_FILE, TABLESPACE_MAP,
11691                                                    BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11692         }
11693         else
11694         {
11695                 ereport(WARNING,
11696                                 (errcode_for_file_access(),
11697                                  errmsg("online backup mode canceled"),
11698                                  errdetail("File \"%s\" was renamed to \"%s\", but "
11699                                                    "file \"%s\" could not be renamed to \"%s\": %m.",
11700                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11701                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11702         }
11703 }
11704
11705 /*
11706  * Read the XLOG page containing RecPtr into readBuf (if not read already).
11707  * Returns number of bytes read, if the page is read successfully, or -1
11708  * in case of errors.  When errors occur, they are ereport'ed, but only
11709  * if they have not been previously reported.
11710  *
11711  * This is responsible for restoring files from archive as needed, as well
11712  * as for waiting for the requested WAL record to arrive in standby mode.
11713  *
11714  * 'emode' specifies the log level used for reporting "file not found" or
11715  * "end of WAL" situations in archive recovery, or in standby mode when a
11716  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11717  * false in those situations, on higher log levels the ereport() won't
11718  * return.
11719  *
11720  * In standby mode, if after a successful return of XLogPageRead() the
11721  * caller finds the record it's interested in to be broken, it should
11722  * ereport the error with the level determined by
11723  * emode_for_corrupt_record(), and then set lastSourceFailed
11724  * and call XLogPageRead() again with the same arguments. This lets
11725  * XLogPageRead() to try fetching the record from another source, or to
11726  * sleep and retry.
11727  */
11728 static int
11729 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
11730                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
11731 {
11732         XLogPageReadPrivate *private =
11733         (XLogPageReadPrivate *) xlogreader->private_data;
11734         int                     emode = private->emode;
11735         uint32          targetPageOff;
11736         XLogSegNo       targetSegNo PG_USED_FOR_ASSERTS_ONLY;
11737         int                     r;
11738
11739         XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
11740         targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
11741
11742         /*
11743          * See if we need to switch to a new segment because the requested record
11744          * is not in the currently open one.
11745          */
11746         if (readFile >= 0 &&
11747                 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
11748         {
11749                 /*
11750                  * Request a restartpoint if we've replayed too much xlog since the
11751                  * last one.
11752                  */
11753                 if (bgwriterLaunched)
11754                 {
11755                         if (XLogCheckpointNeeded(readSegNo))
11756                         {
11757                                 (void) GetRedoRecPtr();
11758                                 if (XLogCheckpointNeeded(readSegNo))
11759                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
11760                         }
11761                 }
11762
11763                 close(readFile);
11764                 readFile = -1;
11765                 readSource = 0;
11766         }
11767
11768         XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
11769
11770 retry:
11771         /* See if we need to retrieve more data */
11772         if (readFile < 0 ||
11773                 (readSource == XLOG_FROM_STREAM &&
11774                  receivedUpto < targetPagePtr + reqLen))
11775         {
11776                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
11777                                                                                  private->randAccess,
11778                                                                                  private->fetching_ckpt,
11779                                                                                  targetRecPtr))
11780                 {
11781                         if (readFile >= 0)
11782                                 close(readFile);
11783                         readFile = -1;
11784                         readLen = 0;
11785                         readSource = 0;
11786
11787                         return -1;
11788                 }
11789         }
11790
11791         /*
11792          * At this point, we have the right segment open and if we're streaming we
11793          * know the requested record is in it.
11794          */
11795         Assert(readFile != -1);
11796
11797         /*
11798          * If the current segment is being streamed from master, calculate how
11799          * much of the current page we have received already. We know the
11800          * requested record has been received, but this is for the benefit of
11801          * future calls, to allow quick exit at the top of this function.
11802          */
11803         if (readSource == XLOG_FROM_STREAM)
11804         {
11805                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
11806                         readLen = XLOG_BLCKSZ;
11807                 else
11808                         readLen = XLogSegmentOffset(receivedUpto, wal_segment_size) -
11809                                 targetPageOff;
11810         }
11811         else
11812                 readLen = XLOG_BLCKSZ;
11813
11814         /* Read the requested page */
11815         readOff = targetPageOff;
11816         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
11817         {
11818                 char            fname[MAXFNAMELEN];
11819                 int                     save_errno = errno;
11820
11821                 XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
11822                 errno = save_errno;
11823                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11824                                 (errcode_for_file_access(),
11825                                  errmsg("could not seek in log segment %s to offset %u: %m",
11826                                                 fname, readOff)));
11827                 goto next_record_is_invalid;
11828         }
11829
11830         pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
11831         r = read(readFile, readBuf, XLOG_BLCKSZ);
11832         if (r != XLOG_BLCKSZ)
11833         {
11834                 char            fname[MAXFNAMELEN];
11835                 int                     save_errno = errno;
11836
11837                 pgstat_report_wait_end();
11838                 XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
11839                 if (r < 0)
11840                 {
11841                         errno = save_errno;
11842                         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11843                                         (errcode_for_file_access(),
11844                                          errmsg("could not read from log segment %s, offset %u: %m",
11845                                                         fname, readOff)));
11846                 }
11847                 else
11848                         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11849                                         (errcode(ERRCODE_DATA_CORRUPTED),
11850                                          errmsg("could not read from log segment %s, offset %u: read %d of %zu",
11851                                                         fname, readOff, r, (Size) XLOG_BLCKSZ)));
11852                 goto next_record_is_invalid;
11853         }
11854         pgstat_report_wait_end();
11855
11856         Assert(targetSegNo == readSegNo);
11857         Assert(targetPageOff == readOff);
11858         Assert(reqLen <= readLen);
11859
11860         *readTLI = curFileTLI;
11861
11862         /*
11863          * Check the page header immediately, so that we can retry immediately if
11864          * it's not valid. This may seem unnecessary, because XLogReadRecord()
11865          * validates the page header anyway, and would propagate the failure up to
11866          * ReadRecord(), which would retry. However, there's a corner case with
11867          * continuation records, if a record is split across two pages such that
11868          * we would need to read the two pages from different sources. For
11869          * example, imagine a scenario where a streaming replica is started up,
11870          * and replay reaches a record that's split across two WAL segments. The
11871          * first page is only available locally, in pg_wal, because it's already
11872          * been recycled in the master. The second page, however, is not present
11873          * in pg_wal, and we should stream it from the master. There is a recycled
11874          * WAL segment present in pg_wal, with garbage contents, however. We would
11875          * read the first page from the local WAL segment, but when reading the
11876          * second page, we would read the bogus, recycled, WAL segment. If we
11877          * didn't catch that case here, we would never recover, because
11878          * ReadRecord() would retry reading the whole record from the beginning.
11879          *
11880          * Of course, this only catches errors in the page header, which is what
11881          * happens in the case of a recycled WAL segment. Other kinds of errors or
11882          * corruption still has the same problem. But this at least fixes the
11883          * common case, which can happen as part of normal operation.
11884          *
11885          * Validating the page header is cheap enough that doing it twice
11886          * shouldn't be a big deal from a performance point of view.
11887          */
11888         if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
11889         {
11890                 /* reset any error XLogReaderValidatePageHeader() might have set */
11891                 xlogreader->errormsg_buf[0] = '\0';
11892                 goto next_record_is_invalid;
11893         }
11894
11895         return readLen;
11896
11897 next_record_is_invalid:
11898         lastSourceFailed = true;
11899
11900         if (readFile >= 0)
11901                 close(readFile);
11902         readFile = -1;
11903         readLen = 0;
11904         readSource = 0;
11905
11906         /* In standby-mode, keep trying */
11907         if (StandbyMode)
11908                 goto retry;
11909         else
11910                 return -1;
11911 }
11912
11913 /*
11914  * Open the WAL segment containing WAL location 'RecPtr'.
11915  *
11916  * The segment can be fetched via restore_command, or via walreceiver having
11917  * streamed the record, or it can already be present in pg_wal. Checking
11918  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
11919  * too, in case someone copies a new segment directly to pg_wal. That is not
11920  * documented or recommended, though.
11921  *
11922  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11923  * prepare to read WAL starting from RedoStartLSN after this.
11924  *
11925  * 'RecPtr' might not point to the beginning of the record we're interested
11926  * in, it might also point to the page or segment header. In that case,
11927  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11928  * used to decide which timeline to stream the requested WAL from.
11929  *
11930  * If the record is not immediately available, the function returns false
11931  * if we're not in standby mode. In standby mode, waits for it to become
11932  * available.
11933  *
11934  * When the requested record becomes available, the function opens the file
11935  * containing it (if not open already), and returns true. When end of standby
11936  * mode is triggered by the user, and there is no more WAL available, returns
11937  * false.
       *
       * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
       * read from the archive or pg_wal.
       *
       * Besides its arguments, this function reads and updates recovery state
       * declared outside it, including currentSource, lastSourceFailed, readFile,
       * readSource, curFileTLI, receivedUpto and expectedTLEs.
11938  */
11939 static bool
11940 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11941                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
11942 {
              /*
               * last_fail_time is static, so it survives across calls; it is only
               * read and written by the retry-interval check in the
               * XLOG_FROM_STREAM failure branch below.
               */
11943         static TimestampTz last_fail_time = 0;
11944         TimestampTz now;
              /* Ensures WalRcvForceReply() is issued at most once per call. */
11945         bool            streaming_reply_sent = false;
11946
11947         /*-------
11948          * Standby mode is implemented by a state machine:
11949          *
11950          * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
11951          *        pg_wal (XLOG_FROM_PG_WAL)
11952          * 2. Check trigger file
11953          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11954          * 4. Rescan timelines
11955          * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
11956          *
11957          * Failure to read from the current source advances the state machine to
11958          * the next state.
11959          *
11960          * 'currentSource' indicates the current state. There are no currentSource
11961          * values for "check trigger", "rescan timelines", and "sleep" states,
11962          * those actions are taken when reading from the previous source fails, as
11963          * part of advancing to the next state.
11964          *-------
11965          */
              /*
               * Pick the initial state: outside archive recovery only pg_wal is
               * used; otherwise start from the archive unless a source is already
               * set (currentSource != 0) from an earlier call.
               */
11966         if (!InArchiveRecovery)
11967                 currentSource = XLOG_FROM_PG_WAL;
11968         else if (currentSource == 0)
11969                 currentSource = XLOG_FROM_ARCHIVE;
11970
11971         for (;;)
11972         {
                      /* Remembered only for the DEBUG2 source-switch message below. */
11973                 int                     oldSource = currentSource;
11974
11975                 /*
11976                  * First check if we failed to read from the current source, and
11977                  * advance the state machine if so. The failure to read might've
11978                  * happened outside this function, e.g when a CRC check fails on a
11979                  * record, or within this loop.
11980                  */
11981                 if (lastSourceFailed)
11982                 {
11983                         switch (currentSource)
11984                         {
11985                                 case XLOG_FROM_ARCHIVE:
11986                                 case XLOG_FROM_PG_WAL:
11987
11988                                         /*
11989                                          * Check to see if the trigger file exists. Note that we
11990                                          * do this only after failure, so when you create the
11991                                          * trigger file, we still finish replaying as much as we
11992                                          * can from archive and pg_wal before failover.
11993                                          */
11994                                         if (StandbyMode && CheckForStandbyTrigger())
11995                                         {
11996                                                 ShutdownWalRcv();
11997                                                 return false;
11998                                         }
11999
12000                                         /*
12001                                          * Not in standby mode, and we've now tried the archive
12002                                          * and pg_wal.
12003                                          */
12004                                         if (!StandbyMode)
12005                                                 return false;
12006
12007                                         /*
12008                                          * If primary_conninfo is set, launch walreceiver to try
12009                                          * to stream the missing WAL.
12010                                          *
12011                                          * If fetching_ckpt is true, RecPtr points to the initial
12012                                          * checkpoint location. In that case, we use RedoStartLSN
12013                                          * as the streaming start position instead of RecPtr, so
12014                                          * that when we later jump backwards to start redo at
12015                                          * RedoStartLSN, we will have the logs streamed already.
12016                                          */
12017                                         if (PrimaryConnInfo)
12018                                         {
12019                                                 XLogRecPtr      ptr;
12020                                                 TimeLineID      tli;
12021
12022                                                 if (fetching_ckpt)
12023                                                 {
12024                                                         ptr = RedoStartLSN;
12025                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
12026                                                 }
12027                                                 else
12028                                                 {
12029                                                         ptr = RecPtr;
12030
12031                                                         /*
12032                                                          * Use the record begin position to determine the
12033                                                          * TLI, rather than the position we're reading.
12034                                                          */
12035                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12036
                                                              /*
                                                               * A timeline older than the one the
                                                               * previous file came from is reported as
                                                               * an ERROR; curFileTLI == 0 skips the check.
                                                               */
12037                                                         if (curFileTLI > 0 && tli < curFileTLI)
12038                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12039                                                                          (uint32) (tliRecPtr >> 32),
12040                                                                          (uint32) tliRecPtr,
12041                                                                          tli, curFileTLI);
12042                                                 }
12043                                                 curFileTLI = tli;
12044                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12045                                                                                          PrimarySlotName);
                                                      /*
                                                       * Zeroing receivedUpto makes the
                                                       * "RecPtr < receivedUpto" shortcut in the
                                                       * XLOG_FROM_STREAM read path below fail
                                                       * (assuming XLogRecPtr is unsigned), so the
                                                       * position is re-fetched with
                                                       * GetWalRcvWriteRecPtr().
                                                       */
12046                                                 receivedUpto = 0;
12047                                         }
12048
12049                                         /*
12050                                          * Move to XLOG_FROM_STREAM state in either case. We'll
12051                                          * get immediate failure if we didn't launch walreceiver,
12052                                          * and move on to the next state.
12053                                          */
12054                                         currentSource = XLOG_FROM_STREAM;
12055                                         break;
12056
12057                                 case XLOG_FROM_STREAM:
12058
12059                                         /*
12060                                          * Failure while streaming. Most likely, we got here
12061                                          * because streaming replication was terminated, or
12062                                          * promotion was triggered. But we also get here if we
12063                                          * find an invalid record in the WAL streamed from master,
12064                                          * in which case something is seriously wrong. There's
12065                                          * little chance that the problem will just go away, but
12066                                          * PANIC is not good for availability either, especially
12067                                          * in hot standby mode. So, we treat that the same as
12068                                          * disconnection, and retry from archive/pg_wal again. The
12069                                          * WAL in the archive should be identical to what was
12070                                          * streamed, so it's unlikely that it helps, but one can
12071                                          * hope...
12072                                          */
12073
12074                                         /*
12075                                          * Before we leave XLOG_FROM_STREAM state, make sure that
12076                                          * walreceiver is not active, so that it won't overwrite
12077                                          * WAL that we restore from archive.
12078                                          */
12079                                         if (WalRcvStreaming())
12080                                                 ShutdownWalRcv();
12081
12082                                         /*
12083                                          * Before we sleep, re-scan for possible new timelines if
12084                                          * we were requested to recover to the latest timeline.
                                               * If rescanLatestTimeLine() returns true we go straight
                                               * back to the archive without sleeping.
12085                                          */
12086                                         if (recoveryTargetIsLatest)
12087                                         {
12088                                                 if (rescanLatestTimeLine())
12089                                                 {
12090                                                         currentSource = XLOG_FROM_ARCHIVE;
12091                                                         break;
12092                                                 }
12093                                         }
12094
12095                                         /*
12096                                          * XLOG_FROM_STREAM is the last state in our state
12097                                          * machine, so we've exhausted all the options for
12098                                          * obtaining the requested WAL. We're going to loop back
12099                                          * and retry from the archive, but if it hasn't been long
12100                                          * since last attempt, sleep wal_retrieve_retry_interval
12101                                          * milliseconds to avoid busy-waiting.
12102                                          */
12103                                         now = GetCurrentTimestamp();
12104                                         if (!TimestampDifferenceExceeds(last_fail_time, now,
12105                                                                                                         wal_retrieve_retry_interval))
12106                                         {
12107                                                 long            secs,
12108                                                                         wait_time;
12109                                                 int                     usecs;
12110
                                                      /*
                                                       * Sleep only for the remainder of the interval:
                                                       * secs/usecs presumably hold the time elapsed
                                                       * since last_fail_time, converted here to
                                                       * milliseconds.
                                                       */
12111                                                 TimestampDifference(last_fail_time, now, &secs, &usecs);
12112                                                 wait_time = wal_retrieve_retry_interval -
12113                                                         (secs * 1000 + usecs / 1000);
12114
                                                      /*
                                                       * The result of WaitLatch() is not examined;
                                                       * whichever event ended the wait, we reset
                                                       * the latch and carry on.
                                                       */
12115                                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
12116                                                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
12117                                                                   wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
12118                                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12119                                                 now = GetCurrentTimestamp();
12120                                         }
12121                                         last_fail_time = now;
12122                                         currentSource = XLOG_FROM_ARCHIVE;
12123                                         break;
12124
12125                                 default:
12126                                         elog(ERROR, "unexpected WAL source %d", currentSource);
12127                         }
12128                 }
12129                 else if (currentSource == XLOG_FROM_PG_WAL)
12130                 {
12131                         /*
12132                          * We just successfully read a file in pg_wal. We prefer files in
12133                          * the archive over ones in pg_wal, so try the next file again
12134                          * from the archive first.
12135                          */
12136                         if (InArchiveRecovery)
12137                                 currentSource = XLOG_FROM_ARCHIVE;
12138                 }
12139
                      /*
                       * lastSourceFailed has not been reset yet at this point, so it still
                       * tells whether the switch was caused by a failure.
                       */
12140                 if (currentSource != oldSource)
12141                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
12142                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12143                                  lastSourceFailed ? "failure" : "success");
12144
12145                 /*
12146                  * We've now handled possible failure. Try to read from the chosen
12147                  * source.
12148                  */
12149                 lastSourceFailed = false;
12150
12151                 switch (currentSource)
12152                 {
12153                         case XLOG_FROM_ARCHIVE:
12154                         case XLOG_FROM_PG_WAL:
12155                                 /* Close any old file we might have open. */
12156                                 if (readFile >= 0)
12157                                 {
12158                                         close(readFile);
12159                                         readFile = -1;
12160                                 }
12161                                 /* Reset curFileTLI if random fetch. */
12162                                 if (randAccess)
12163                                         curFileTLI = 0;
12164
12165                                 /*
12166                                  * Try to restore the file from archive, or read an existing
12167                                  * file from pg_wal.
                                       *
                                       * In the XLOG_FROM_ARCHIVE state the source passed down is
                                       * XLOG_FROM_ANY, matching state 1 of the state machine
                                       * ("either archive or pg_wal"); in the XLOG_FROM_PG_WAL
                                       * state it is passed through unchanged.
12168                                  */
12169                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12170                                                                                           currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12171                                                                                           currentSource);
12172                                 if (readFile >= 0)
12173                                         return true;    /* success! */
12174
12175                                 /*
12176                                  * Nope, not found in archive or pg_wal.
12177                                  */
12178                                 lastSourceFailed = true;
12179                                 break;
12180
12181                         case XLOG_FROM_STREAM:
12182                                 {
12183                                         bool            havedata;
12184
12185                                         /*
12186                                          * Check if WAL receiver is still active.
12187                                          */
12188                                         if (!WalRcvStreaming())
12189                                         {
12190                                                 lastSourceFailed = true;
12191                                                 break;
12192                                         }
12193
12194                                         /*
12195                                          * Walreceiver is active, so see if new data has arrived.
12196                                          *
12197                                          * We only advance XLogReceiptTime when we obtain fresh
12198                                          * WAL from walreceiver and observe that we had already
12199                                          * processed everything before the most recent "chunk"
12200                                          * that it flushed to disk.  In steady state where we are
12201                                          * keeping up with the incoming data, XLogReceiptTime will
12202                                          * be updated on each cycle. When we are behind,
12203                                          * XLogReceiptTime will not advance, so the grace time
12204                                          * allotted to conflicting queries will decrease.
12205                                          */
12206                                         if (RecPtr < receivedUpto)
12207                                                 havedata = true;
12208                                         else
12209                                         {
12210                                                 XLogRecPtr      latestChunkStart;
12211
                                                      /*
                                                       * Refresh our cached position; the data only
                                                       * counts if it was received on the timeline
                                                       * we are currently reading (curFileTLI).
                                                       */
12212                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
12213                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
12214                                                 {
12215                                                         havedata = true;
12216                                                         if (latestChunkStart <= RecPtr)
12217                                                         {
12218                                                                 XLogReceiptTime = GetCurrentTimestamp();
12219                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
12220                                                         }
12221                                                 }
12222                                                 else
12223                                                         havedata = false;
12224                                         }
12225                                         if (havedata)
12226                                         {
12227                                                 /*
12228                                                  * Great, streamed far enough.  Open the file if it's
12229                                                  * not open already.  Also read the timeline history
12230                                                  * file if we haven't initialized timeline history
12231                                                  * yet; it should be streamed over and present in
12232                                                  * pg_wal by now.  Use XLOG_FROM_STREAM so that source
12233                                                  * info is set correctly and XLogReceiptTime isn't
12234                                                  * changed.
12235                                                  */
12236                                                 if (readFile < 0)
12237                                                 {
12238                                                         if (!expectedTLEs)
12239                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
12240                                                         readFile = XLogFileRead(readSegNo, PANIC,
12241                                                                                                         receiveTLI,
12242                                                                                                         XLOG_FROM_STREAM, false);
12243                                                         Assert(readFile >= 0);
12244                                                 }
12245                                                 else
12246                                                 {
12247                                                         /* just make sure source info is correct... */
12248                                                         readSource = XLOG_FROM_STREAM;
12249                                                         XLogReceiptSource = XLOG_FROM_STREAM;
12250                                                         return true;
12251                                                 }
                                                      /*
                                                       * Reached only when the file was opened just
                                                       * above.  We loop around rather than return;
                                                       * the next pass is expected to take the
                                                       * "already open" branch and return true,
                                                       * provided walreceiver is still streaming.
                                                       */
12252                                                 break;
12253                                         }
12254
12255                                         /*
12256                                          * Data not here yet. Check for trigger, then wait for
12257                                          * walreceiver to wake us up when new WAL arrives.
12258                                          */
12259                                         if (CheckForStandbyTrigger())
12260                                         {
12261                                                 /*
12262                                                  * Note that we don't "return false" immediately here.
12263                                                  * After being triggered, we still want to replay all
12264                                                  * the WAL that was already streamed. It's in pg_wal
12265                                                  * now, so we just treat this as a failure, and the
12266                                                  * state machine will move on to replay the streamed
12267                                                  * WAL from pg_wal, and then recheck the trigger and
12268                                                  * exit replay.
12269                                                  */
12270                                                 lastSourceFailed = true;
12271                                                 break;
12272                                         }
12273
12274                                         /*
12275                                          * Since we have replayed everything we have received so
12276                                          * far and are about to start waiting for more WAL, let's
12277                                          * tell the upstream server our replay location now so
12278                                          * that pg_stat_replication doesn't show stale
12279                                          * information.
12280                                          */
12281                                         if (!streaming_reply_sent)
12282                                         {
12283                                                 WalRcvForceReply();
12284                                                 streaming_reply_sent = true;
12285                                         }
12286
12287                                         /*
12288                                          * Wait for more WAL to arrive. Time out after 5 seconds
12289                                          * to react to a trigger file promptly.
                                               * The WaitLatch() result is not examined; the next
                                               * loop iteration re-checks everything.
12290                                          */
12291                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
12292                                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
12293                                                           5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
12294                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
12295                                         break;
12296                                 }
12297
12298                         default:
12299                                 elog(ERROR, "unexpected WAL source %d", currentSource);
12300                 }
12301
12302                 /*
12303                  * This possibly-long loop needs to handle interrupts of startup
12304                  * process.
12305                  */
12306                 HandleStartupProcInterrupts();
12307         }
12308
              /* The loop above has no exit other than its return statements. */
12309         return false;                           /* not reached */
12310 }
12311
12312 /*
12313  * Determine what log level should be used to report a corrupt WAL record
12314  * in the current WAL page, previously read by XLogPageRead().
12315  *
12316  * 'emode' is the error mode that would be used to report a file-not-found
12317  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
12318  * we're retrying the exact same record that we've tried previously, only
12319  * complain the first time to keep the noise down.  However, we only do when
12320  * reading from pg_wal, because we don't expect any invalid records in archive
12321  * or in records streamed from master. Files in the archive should be complete,
12322  * and we should never hit the end of WAL because we stop and wait for more WAL
12323  * to arrive before replaying it.
12324  *
12325  * NOTE: This function remembers the RecPtr value it was last called with,
12326  * to suppress repeated messages about the same record. Only call this when
12327  * you are about to ereport(), or you might cause a later message to be
12328  * erroneously suppressed.
12329  */
12330 static int
12331 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12332 {
12333         static XLogRecPtr lastComplaint = 0;
12334
12335         if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12336         {
12337                 if (RecPtr == lastComplaint)
12338                         emode = DEBUG1;
12339                 else
12340                         lastComplaint = RecPtr;
12341         }
12342         return emode;
12343 }
12344
12345 /*
12346  * Check to see whether the user-specified trigger file exists and whether a
12347  * promote request has arrived.  If either condition holds, return true.
12348  */
12349 static bool
12350 CheckForStandbyTrigger(void)
12351 {
12352         struct stat stat_buf;
12353         static bool triggered = false;
12354
12355         if (triggered)
12356                 return true;
12357
12358         if (IsPromoteTriggered())
12359         {
12360                 /*
12361                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12362                  * signal handler. It now leaves the file in place and lets the
12363                  * Startup process do the unlink. This allows Startup to know whether
12364                  * it should create a full checkpoint before starting up (fallback
12365                  * mode). Fast promotion takes precedence.
12366                  */
12367                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12368                 {
12369                         unlink(PROMOTE_SIGNAL_FILE);
12370                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12371                         fast_promote = true;
12372                 }
12373                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12374                 {
12375                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12376                         fast_promote = false;
12377                 }
12378
12379                 ereport(LOG, (errmsg("received promote request")));
12380
12381                 ResetPromoteTriggered();
12382                 triggered = true;
12383                 return true;
12384         }
12385
12386         if (TriggerFile == NULL)
12387                 return false;
12388
12389         if (stat(TriggerFile, &stat_buf) == 0)
12390         {
12391                 ereport(LOG,
12392                                 (errmsg("trigger file found: %s", TriggerFile)));
12393                 unlink(TriggerFile);
12394                 triggered = true;
12395                 fast_promote = true;
12396                 return true;
12397         }
12398         else if (errno != ENOENT)
12399                 ereport(ERROR,
12400                                 (errcode_for_file_access(),
12401                                  errmsg("could not stat trigger file \"%s\": %m",
12402                                                 TriggerFile)));
12403
12404         return false;
12405 }
12406
12407 /*
12408  * Remove the files signaling a standby promotion request.
12409  */
12410 void
12411 RemovePromoteSignalFiles(void)
12412 {
12413         unlink(PROMOTE_SIGNAL_FILE);
12414         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12415 }
12416
12417 /*
12418  * Check to see if a promote request has arrived. Should be
12419  * called by postmaster after receiving SIGUSR1.
12420  */
12421 bool
12422 CheckPromoteSignal(void)
12423 {
12424         struct stat stat_buf;
12425
12426         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12427                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12428                 return true;
12429
12430         return false;
12431 }
12432
12433 /*
12434  * Wake up startup process to replay newly arrived WAL, or to notice that
12435  * failover has been requested.
12436  */
12437 void
12438 WakeupRecovery(void)
12439 {
12440         SetLatch(&XLogCtl->recoveryWakeupLatch);
12441 }
12442
12443 /*
12444  * Update the WalWriterSleeping flag.
12445  */
12446 void
12447 SetWalWriterSleeping(bool sleeping)
12448 {
12449         SpinLockAcquire(&XLogCtl->info_lck);
12450         XLogCtl->WalWriterSleeping = sleeping;
12451         SpinLockRelease(&XLogCtl->info_lck);
12452 }
12453
12454 /*
12455  * Schedule a walreceiver wakeup in the main recovery loop.
12456  */
12457 void
12458 XLogRequestWalReceiverReply(void)
12459 {
12460         doRequestWalReceiverReply = true;
12461 }