granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL write-ahead log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <math.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <unistd.h>
  24
  25 #include "access/clog.h"
  26 #include "access/commit_ts.h"
  27 #include "access/multixact.h"
  28 #include "access/rewriteheap.h"
  29 #include "access/subtrans.h"
  30 #include "access/timeline.h"
  31 #include "access/transam.h"
  32 #include "access/tuptoaster.h"
  33 #include "access/twophase.h"
  34 #include "access/xact.h"
  35 #include "access/xlog_internal.h"
  36 #include "access/xloginsert.h"
  37 #include "access/xlogreader.h"
  38 #include "access/xlogutils.h"
  39 #include "catalog/catversion.h"
  40 #include "catalog/pg_control.h"
  41 #include "catalog/pg_database.h"
  42 #include "commands/tablespace.h"
  43 #include "miscadmin.h"
  44 #include "pgstat.h"
  45 #include "port/atomics.h"
  46 #include "postmaster/bgwriter.h"
  47 #include "postmaster/walwriter.h"
  48 #include "postmaster/startup.h"
  49 #include "replication/basebackup.h"
  50 #include "replication/logical.h"
  51 #include "replication/slot.h"
  52 #include "replication/origin.h"
  53 #include "replication/snapbuild.h"
  54 #include "replication/walreceiver.h"
  55 #include "replication/walsender.h"
  56 #include "storage/bufmgr.h"
  57 #include "storage/fd.h"
  58 #include "storage/ipc.h"
  59 #include "storage/large_object.h"
  60 #include "storage/latch.h"
  61 #include "storage/pmsignal.h"
  62 #include "storage/predicate.h"
  63 #include "storage/proc.h"
  64 #include "storage/procarray.h"
  65 #include "storage/reinit.h"
  66 #include "storage/smgr.h"
  67 #include "storage/spin.h"
  68 #include "utils/backend_random.h"
  69 #include "utils/builtins.h"
  70 #include "utils/guc.h"
  71 #include "utils/memutils.h"
  72 #include "utils/pg_lsn.h"
  73 #include "utils/ps_status.h"
  74 #include "utils/relmapper.h"
  75 #include "utils/snapmgr.h"
  76 #include "utils/timestamp.h"
  77 #include "pg_trace.h"
  78
  79 extern uint32 bootstrap_data_checksum_version;
  80
  81 /* File path names (all relative to $PGDATA) */
  82 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  83 #define RECOVERY_COMMAND_DONE   "recovery.done"
  84 #define PROMOTE_SIGNAL_FILE             "promote"
  85 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  86
  87
  88 /* User-settable parameters */
  89 int                     max_wal_size_mb = 1024; /* 1 GB */
  90 int                     min_wal_size_mb = 80;   /* 80 MB */
  91 int                     wal_keep_segments = 0;
  92 int                     XLOGbuffers = -1;
  93 int                     XLogArchiveTimeout = 0;
  94 int                     XLogArchiveMode = ARCHIVE_MODE_OFF;
  95 char       *XLogArchiveCommand = NULL;
  96 bool            EnableHotStandby = false;
  97 bool            fullPageWrites = true;
  98 bool            wal_log_hints = false;
  99 bool            wal_compression = false;
 100 char       *wal_consistency_checking_string = NULL;
 101 bool       *wal_consistency_checking = NULL;
 102 bool            log_checkpoints = false;
 103 int                     sync_method = DEFAULT_SYNC_METHOD;
 104 int                     wal_level = WAL_LEVEL_MINIMAL;
 105 int                     CommitDelay = 0;        /* precommit delay in microseconds */
 106 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 107 int                     wal_retrieve_retry_interval = 5000;
 108
 109 #ifdef WAL_DEBUG
 110 bool            XLOG_DEBUG = false;
 111 #endif
 112
 113 int                     wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
 114
 115 /*
 116  * Number of WAL insertion locks to use. A higher value allows more insertions
 117  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 118  * which needs to iterate all the locks.
 119  */
 120 #define NUM_XLOGINSERT_LOCKS  8
 121
 122 /*
 123  * Max distance from last checkpoint, before triggering a new xlog-based
 124  * checkpoint.
 125  */
 126 int                     CheckPointSegments;
 127
 128 /* Estimated distance between checkpoints, in bytes */
 129 static double CheckPointDistanceEstimate = 0;
 130 static double PrevCheckPointDistance = 0;
 131
 132 /*
 133  * GUC support
 134  */
 135 const struct config_enum_entry sync_method_options[] = {
 136         {"fsync", SYNC_METHOD_FSYNC, false},
 137 #ifdef HAVE_FSYNC_WRITETHROUGH
 138         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 139 #endif
 140 #ifdef HAVE_FDATASYNC
 141         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 142 #endif
 143 #ifdef OPEN_SYNC_FLAG
 144         {"open_sync", SYNC_METHOD_OPEN, false},
 145 #endif
 146 #ifdef OPEN_DATASYNC_FLAG
 147         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 148 #endif
 149         {NULL, 0, false}
 150 };
 151
 152
 153 /*
 154  * Although only "on", "off", and "always" are documented,
 155  * we accept all the likely variants of "on" and "off".
 156  */
 157 const struct config_enum_entry archive_mode_options[] = {
 158         {"always", ARCHIVE_MODE_ALWAYS, false},
 159         {"on", ARCHIVE_MODE_ON, false},
 160         {"off", ARCHIVE_MODE_OFF, false},
 161         {"true", ARCHIVE_MODE_ON, true},
 162         {"false", ARCHIVE_MODE_OFF, true},
 163         {"yes", ARCHIVE_MODE_ON, true},
 164         {"no", ARCHIVE_MODE_OFF, true},
 165         {"1", ARCHIVE_MODE_ON, true},
 166         {"0", ARCHIVE_MODE_OFF, true},
 167         {NULL, 0, false}
 168 };
 169
 170 /*
 171  * Statistics for current checkpoint are collected in this global struct.
 172  * Because only the checkpointer or a stand-alone backend can perform
 173  * checkpoints, this will be unused in normal backends.
 174  */
 175 CheckpointStatsData CheckpointStats;
 176
 177 /*
 178  * ThisTimeLineID will be same in all backends --- it identifies current
 179  * WAL timeline for the database system.
 180  */
 181 TimeLineID      ThisTimeLineID = 0;
 182
 183 /*
 184  * Are we doing recovery from XLOG?
 185  *
 186  * This is only ever true in the startup process; it should be read as meaning
 187  * "this process is replaying WAL records", rather than "the system is in
 188  * recovery mode".  It should be examined primarily by functions that need
 189  * to act differently when called from a WAL redo function (e.g., to skip WAL
 190  * logging).  To check whether the system is in recovery regardless of which
 191  * process you're running in, use RecoveryInProgress() but only after shared
 192  * memory startup and lock initialization.
 193  */
 194 bool            InRecovery = false;
 195
 196 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 197 HotStandbyState standbyState = STANDBY_DISABLED;
 198
 199 static XLogRecPtr LastRec;
 200
 201 /* Local copy of WalRcv->receivedUpto */
 202 static XLogRecPtr receivedUpto = 0;
 203 static TimeLineID receiveTLI = 0;
 204
 205 /*
 206  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 207  * the replayed WAL records indicate. It's initialized with full_page_writes
 208  * that the recovery starting checkpoint record indicates, and then updated
 209  * each time XLOG_FPW_CHANGE record is replayed.
 210  */
 211 static bool lastFullPageWrites;
 212
 213 /*
 214  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 215  * known, need to check the shared state".
 216  */
 217 static bool LocalRecoveryInProgress = true;
 218
 219 /*
 220  * Local copy of SharedHotStandbyActive variable. False actually means "not
 221  * known, need to check the shared state".
 222  */
 223 static bool LocalHotStandbyActive = false;
 224
 225 /*
 226  * Local state for XLogInsertAllowed():
 227  *              1: unconditionally allowed to insert XLOG
 228  *              0: unconditionally not allowed to insert XLOG
 229  *              -1: must check RecoveryInProgress(); disallow until it is false
 230  * Most processes start with -1 and transition to 1 after seeing that recovery
 231  * is not in progress.  But we can also force the value for special cases.
 232  * The coding in XLogInsertAllowed() depends on the first two of these states
 233  * being numerically the same as bool true and false.
 234  */
 235 static int      LocalXLogInsertAllowed = -1;
 236
 237 /*
 238  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 239  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 240  * currently recovering using offline XLOG archives. These variables are only
 241  * valid in the startup process.
 242  *
 243  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 244  * currently performing crash recovery using only XLOG files in pg_wal, but
 245  * will switch to using offline XLOG archives as soon as we reach the end of
 246  * WAL in pg_wal.
 247 */
 248 bool            ArchiveRecoveryRequested = false;
 249 bool            InArchiveRecovery = false;
 250
 251 /* Was the last xlog file restored from archive, or local? */
 252 static bool restoredFromArchive = false;
 253
 254 /* Buffers dedicated to consistency checks of size BLCKSZ */
 255 static char *replay_image_masked = NULL;
 256 static char *master_image_masked = NULL;
 257
 258 /* options taken from recovery.conf for archive recovery */
 259 char       *recoveryRestoreCommand = NULL;
 260 static char *recoveryEndCommand = NULL;
 261 static char *archiveCleanupCommand = NULL;
 262 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 263 static bool recoveryTargetInclusive = true;
 264 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
 265 static TransactionId recoveryTargetXid;
 266 static TimestampTz recoveryTargetTime;
 267 static char *recoveryTargetName;
 268 static XLogRecPtr recoveryTargetLSN;
 269 static int      recovery_min_apply_delay = 0;
 270 static TimestampTz recoveryDelayUntilTime;
 271
 272 /* options taken from recovery.conf for XLOG streaming */
 273 static bool StandbyModeRequested = false;
 274 static char *PrimaryConnInfo = NULL;
 275 static char *PrimarySlotName = NULL;
 276 static char *TriggerFile = NULL;
 277
 278 /* are we currently in standby mode? */
 279 bool            StandbyMode = false;
 280
 281 /* whether request for fast promotion has been made yet */
 282 static bool fast_promote = false;
 283
 284 /*
 285  * if recoveryStopsBefore/After returns true, it saves information of the stop
 286  * point here
 287  */
 288 static TransactionId recoveryStopXid;
 289 static TimestampTz recoveryStopTime;
 290 static XLogRecPtr recoveryStopLSN;
 291 static char recoveryStopName[MAXFNAMELEN];
 292 static bool recoveryStopAfter;
 293
 294 /*
 295  * During normal operation, the only timeline we care about is ThisTimeLineID.
 296  * During recovery, however, things are more complicated.  To simplify life
 297  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 298  * scan through the WAL history (that is, it is the line that was active when
 299  * the currently-scanned WAL record was generated).  We also need these
 300  * timeline values:
 301  *
 302  * recoveryTargetTLI: the desired timeline that we want to end in.
 303  *
 304  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 305  *
 306  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 307  * its known parents, newest first (so recoveryTargetTLI is always the
 308  * first list member).  Only these TLIs are expected to be seen in the WAL
 309  * segments we read, and indeed only these TLIs will be considered as
 310  * candidate WAL files to open at all.
 311  *
 312  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 313  * (This is not necessarily the same as ThisTimeLineID, because we could
 314  * be scanning data that was copied from an ancestor timeline when the current
 315  * file was created.)  During a sequential scan we do not allow this value
 316  * to decrease.
 317  */
 318 static TimeLineID recoveryTargetTLI;
 319 static bool recoveryTargetIsLatest = false;
 320 static List *expectedTLEs;
 321 static TimeLineID curFileTLI;
 322
 323 /*
 324  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 325  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 326  * end+1 of the last record, and is reset when we end a top-level transaction,
 327  * or start a new one; so it can be used to tell if the current transaction has
 328  * created any XLOG records.
 329  *
 330  * While in parallel mode, this may not be fully up to date.  When committing,
 331  * a transaction can assume this covers all xlog records written either by the
 332  * user backend or by any parallel worker which was present at any point during
 333  * the transaction.  But when aborting, or when still in parallel mode, other
 334  * parallel backends may have written WAL records at later LSNs than the value
 335  * stored here.  The parallel leader advances its own copy, when necessary,
 336  * in WaitForParallelWorkersToFinish.
 337  */
 338 XLogRecPtr      ProcLastRecPtr = InvalidXLogRecPtr;
 339 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 340 XLogRecPtr      XactLastCommitEnd = InvalidXLogRecPtr;
 341
 342 /*
 343  * RedoRecPtr is this backend's local copy of the REDO record pointer
 344  * (which is almost but not quite the same as a pointer to the most recent
 345  * CHECKPOINT record).  We update this from the shared-memory copy,
 346  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 347  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 348  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 349  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 350  * InitXLOGAccess.
 351  */
 352 static XLogRecPtr RedoRecPtr;
 353
 354 /*
 355  * doPageWrites is this backend's local copy of (forcePageWrites ||
 356  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
 357  * a full-page image of a page need to be taken.
 358  */
 359 static bool doPageWrites;
 360
 361 /* Has the recovery code requested a walreceiver wakeup? */
 362 static bool doRequestWalReceiverReply;
 363
 364 /*
 365  * RedoStartLSN points to the checkpoint's REDO location which is specified
 366  * in a backup label file, backup history file or control file. In standby
 367  * mode, XLOG streaming usually starts from the position where an invalid
 368  * record was found. But if we fail to read even the initial checkpoint
 369  * record, we use the REDO location instead of the checkpoint location as
 370  * the start position of XLOG streaming. Otherwise we would have to jump
 371  * backwards to the REDO location after reading the checkpoint record,
 372  * because the REDO record can precede the checkpoint record.
 373  */
 374 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 375
 376 /*----------
 377  * Shared-memory data structures for XLOG control
 378  *
 379  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 380  * the log up to (all records before that point must be written or fsynced).
 381  * LogwrtResult indicates the byte positions we have already written/fsynced.
 382  * These structs are identical but are declared separately to indicate their
 383  * slightly different functions.
 384  *
 385  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 386  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 387  * this arrangement is that the value can be examined by code that already
 388  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 389  * to the shared variable, each backend has a private copy of LogwrtResult,
 390  * which is updated when convenient.
 391  *
 392  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 393  * (protected by info_lck), but we don't need to cache any copies of it.
 394  *
 395  * info_lck is only held long enough to read/update the protected variables,
 396  * so it's a plain spinlock.  The other locks are held longer (potentially
 397  * over I/O operations), so we use LWLocks for them.  These locks are:
 398  *
 399  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 400  * It is only held while initializing and changing the mapping.  If the
 401  * contents of the buffer being replaced haven't been written yet, the mapping
 402  * lock is released while the write is done, and reacquired afterwards.
 403  *
 404  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 405  * XLogFlush).
 406  *
 407  * ControlFileLock: must be held to read/update control file or create
 408  * new log file.
 409  *
 410  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 411  * only one checkpointer at a time; currently, with all checkpoints done by
 412  * the checkpointer, this is just pro forma).
 413  *
 414  *----------
 415  */
 416
 417 typedef struct XLogwrtRqst
 418 {
 419         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 420         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 421 } XLogwrtRqst;
 422
 423 typedef struct XLogwrtResult
 424 {
 425         XLogRecPtr      Write;                  /* last byte + 1 written out */
 426         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 427 } XLogwrtResult;
 428
 429 /*
 430  * Inserting to WAL is protected by a small fixed number of WAL insertion
 431  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 432  * matter which one. To lock out other concurrent insertions, you must hold
 433  * of them. Each WAL insertion lock consists of a lightweight lock, plus an
 434  * indicator of how far the insertion has progressed (insertingAt).
 435  *
 436  * The insertingAt values are read when a process wants to flush WAL from
 437  * the in-memory buffers to disk, to check that all the insertions to the
 438  * region the process is about to write out have finished. You could simply
 439  * wait for all currently in-progress insertions to finish, but the
 440  * insertingAt indicator allows you to ignore insertions to later in the WAL,
 441  * so that you only wait for the insertions that are modifying the buffers
 442  * you're about to write out.
 443  *
 444  * This isn't just an optimization. If all the WAL buffers are dirty, an
 445  * inserter that's holding a WAL insert lock might need to evict an old WAL
 446  * buffer, which requires flushing the WAL. If it's possible for an inserter
 447  * to block on another inserter unnecessarily, deadlock can arise when two
 448  * inserters holding a WAL insert lock wait for each other to finish their
 449  * insertion.
 450  *
 451  * Small WAL records that don't cross a page boundary never update the value,
 452  * the WAL record is just copied to the page and the lock is released. But
 453  * to avoid the deadlock-scenario explained above, the indicator is always
 454  * updated before sleeping while holding an insertion lock.
 455  *
 456  * lastImportantAt contains the LSN of the last important WAL record inserted
 457  * using a given lock. This value is used to detect if there has been
 458  * important WAL activity since the last time some action, like a checkpoint,
 459  * was performed - allowing to not repeat the action if not. The LSN is
 460  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
 461  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
 462  * records.  Tracking the WAL activity directly in WALInsertLock has the
 463  * advantage of not needing any additional locks to update the value.
 464  */
 465 typedef struct
 466 {
 467         LWLock          lock;
 468         XLogRecPtr      insertingAt;
 469         XLogRecPtr      lastImportantAt;
 470 } WALInsertLock;
 471
 472 /*
 473  * All the WAL insertion locks are allocated as an array in shared memory. We
 474  * force the array stride to be a power of 2, which saves a few cycles in
 475  * indexing, but more importantly also ensures that individual slots don't
 476  * cross cache line boundaries. (Of course, we have to also ensure that the
 477  * array start address is suitably aligned.)
 478  */
 479 typedef union WALInsertLockPadded
 480 {
 481         WALInsertLock l;
 482         char            pad[PG_CACHE_LINE_SIZE];
 483 } WALInsertLockPadded;
 484
 485 /*
 486  * State of an exclusive backup, necessary to control concurrent activities
 487  * across sessions when working on exclusive backups.
 488  *
 489  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 490  * running, to be more precise pg_start_backup() is not being executed for
 491  * an exclusive backup and there is no exclusive backup in progress.
 492  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 493  * exclusive backup.
 494  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 495  * running and an exclusive backup is in progress. pg_stop_backup() is
 496  * needed to finish it.
 497  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 498  * exclusive backup.
 499  */
 500 typedef enum ExclusiveBackupState
 501 {
 502         EXCLUSIVE_BACKUP_NONE = 0,
 503         EXCLUSIVE_BACKUP_STARTING,
 504         EXCLUSIVE_BACKUP_IN_PROGRESS,
 505         EXCLUSIVE_BACKUP_STOPPING
 506 } ExclusiveBackupState;
 507
 508 /*
 509  * Session status of running backup, used for sanity checks in SQL-callable
 510  * functions to start and stop backups.
 511  */
 512 static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
 513
 514 /*
 515  * Shared state data for WAL insertion.
 516  */
 517 typedef struct XLogCtlInsert
 518 {
 519         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 520
 521         /*
 522          * CurrBytePos is the end of reserved WAL. The next record will be
 523          * inserted at that position. PrevBytePos is the start position of the
 524          * previously inserted (or rather, reserved) record - it is copied to the
 525          * prev-link of the next record. These are stored as "usable byte
 526          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
 527          */
 528         uint64          CurrBytePos;
 529         uint64          PrevBytePos;
 530
 531         /*
 532          * Make sure the above heavily-contended spinlock and byte positions are
 533          * on their own cache line. In particular, the RedoRecPtr and full page
 534          * write variables below should be on a different cache line. They are
 535          * read on every WAL insertion, but updated rarely, and we don't want
 536          * those reads to steal the cache line containing Curr/PrevBytePos.
 537          */
 538         char            pad[PG_CACHE_LINE_SIZE];
 539
 540         /*
 541          * fullPageWrites is the master copy used by all backends to determine
 542          * whether to write full-page to WAL, instead of using process-local one.
 543          * This is required because, when full_page_writes is changed by SIGHUP,
 544          * we must WAL-log it before it actually affects WAL-logging by backends.
 545          * Checkpointer sets at startup or after SIGHUP.
 546          *
 547          * To read these fields, you must hold an insertion lock. To modify them,
 548          * you must hold ALL the locks.
 549          */
 550         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 551         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 552         bool            fullPageWrites;
 553
 554         /*
 555          * exclusiveBackupState indicates the state of an exclusive backup (see
 556          * comments of ExclusiveBackupState for more details). nonExclusiveBackups
 557          * is a counter indicating the number of streaming base backups currently
 558          * in progress. forcePageWrites is set to true when either of these is
 559          * non-zero. lastBackupStart is the latest checkpoint redo location used
 560          * as a starting point for an online backup.
 561          */
 562         ExclusiveBackupState exclusiveBackupState;
 563         int                     nonExclusiveBackups;
 564         XLogRecPtr      lastBackupStart;
 565
 566         /*
 567          * WAL insertion locks.
 568          */
 569         WALInsertLockPadded *WALInsertLocks;
 570 } XLogCtlInsert;
 571
 572 /*
 573  * Total shared-memory state for XLOG.
 574  */
 575 typedef struct XLogCtlData
 576 {
 577         XLogCtlInsert Insert;
 578
 579         /* Protected by info_lck: */
 580         XLogwrtRqst LogwrtRqst;
 581         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
 582         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
 583         TransactionId ckptXid;
 584         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
 585         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
 586
 587         XLogSegNo       lastRemovedSegNo;       /* latest removed/recycled XLOG segment */
 588
 589         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
 590         XLogRecPtr      unloggedLSN;
 591         slock_t         ulsn_lck;
 592
 593         /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
 594         pg_time_t       lastSegSwitchTime;
 595         XLogRecPtr      lastSegSwitchLSN;
 596
 597         /*
 598          * Protected by info_lck and WALWriteLock (you must hold either lock to
 599          * read it, but both to update)
 600          */
 601         XLogwrtResult LogwrtResult;
 602
 603         /*
 604          * Latest initialized page in the cache (last byte position + 1).
 605          *
 606          * To change the identity of a buffer (and InitializedUpTo), you need to
 607          * hold WALBufMappingLock.  To change the identity of a buffer that's
 608          * still dirty, the old page needs to be written out first, and for that
 609          * you need WALWriteLock, and you need to ensure that there are no
 610          * in-progress insertions to the page by calling
 611          * WaitXLogInsertionsToFinish().
 612          */
 613         XLogRecPtr      InitializedUpTo;
 614
 615         /*
 616          * These values do not change after startup, although the pointed-to pages
 617          * and xlblocks values certainly do.  xlblock values are protected by
 618          * WALBufMappingLock.
 619          */
 620         char       *pages;                      /* buffers for unwritten XLOG pages */
 621         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
 622         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 623
 624         /*
 625          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
 626          * If we created a new timeline when the system was started up,
 627          * PrevTimeLineID is the old timeline's ID that we forked off from.
 628          * Otherwise it's equal to ThisTimeLineID.
 629          */
 630         TimeLineID      ThisTimeLineID;
 631         TimeLineID      PrevTimeLineID;
 632
 633         /*
 634          * archiveCleanupCommand is read from recovery.conf but needs to be in
 635          * shared memory so that the checkpointer process can access it.
 636          */
 637         char            archiveCleanupCommand[MAXPGPATH];
 638
 639         /*
 640          * SharedRecoveryInProgress indicates if we're still in crash or archive
 641          * recovery.  Protected by info_lck.
 642          */
 643         bool            SharedRecoveryInProgress;
 644
 645         /*
 646          * SharedHotStandbyActive indicates if we're still in crash or archive
 647          * recovery.  Protected by info_lck.
 648          */
 649         bool            SharedHotStandbyActive;
 650
 651         /*
 652          * WalWriterSleeping indicates whether the WAL writer is currently in
 653          * low-power mode (and hence should be nudged if an async commit occurs).
 654          * Protected by info_lck.
 655          */
 656         bool            WalWriterSleeping;
 657
 658         /*
 659          * recoveryWakeupLatch is used to wake up the startup process to continue
 660          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 661          * to appear.
 662          */
 663         Latch           recoveryWakeupLatch;
 664
 665         /*
 666          * During recovery, we keep a copy of the latest checkpoint record here.
 667          * lastCheckPointRecPtr points to start of checkpoint record and
 668          * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
 669          * checkpointer when it wants to create a restartpoint.
 670          *
 671          * Protected by info_lck.
 672          */
 673         XLogRecPtr      lastCheckPointRecPtr;
 674         XLogRecPtr      lastCheckPointEndPtr;
 675         CheckPoint      lastCheckPoint;
 676
 677         /*
 678          * lastReplayedEndRecPtr points to end+1 of the last record successfully
 679          * replayed. When we're currently replaying a record, ie. in a redo
 680          * function, replayEndRecPtr points to the end+1 of the record being
 681          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
 682          */
 683         XLogRecPtr      lastReplayedEndRecPtr;
 684         TimeLineID      lastReplayedTLI;
 685         XLogRecPtr      replayEndRecPtr;
 686         TimeLineID      replayEndTLI;
 687         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 688         TimestampTz recoveryLastXTime;
 689
 690         /*
 691          * timestamp of when we started replaying the current chunk of WAL data,
 692          * only relevant for replication or archive recovery
 693          */
 694         TimestampTz currentChunkStartTime;
 695         /* Are we requested to pause recovery? */
 696         bool            recoveryPause;
 697
 698         /*
 699          * lastFpwDisableRecPtr points to the start of the last replayed
 700          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
 701          */
 702         XLogRecPtr      lastFpwDisableRecPtr;
 703
 704         slock_t         info_lck;               /* locks shared variables shown above */
 705 } XLogCtlData;
 706
 707 static XLogCtlData *XLogCtl = NULL;
 708
 709 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
 710 static WALInsertLockPadded *WALInsertLocks = NULL;
 711
 712 /*
 713  * We maintain an image of pg_control in shared memory.
 714  */
 715 static ControlFileData *ControlFile = NULL;
 716
 717 /*
 718  * Calculate the amount of space left on the page after 'endptr'. Beware
 719  * multiple evaluation!
 720  */
 721 #define INSERT_FREESPACE(endptr)        \
 722         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 723
 724 /* Macro to advance to next buffer index. */
 725 #define NextBufIdx(idx)         \
 726                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 727
 728 /*
 729  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 730  * would hold if it was in cache, the page containing 'recptr'.
 731  */
 732 #define XLogRecPtrToBufIdx(recptr)      \
 733         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 734
 735 /*
 736  * These are the number of bytes in a WAL page usable for WAL data.
 737  */
 738 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 739
 740 /* Convert min_wal_size_mb and max wal_size_mb to equivalent segment count */
 741 #define ConvertToXSegs(x, segsize)      \
 742         (x / ((segsize) / (1024 * 1024)))
 743
 744 /* The number of bytes in a WAL segment usable for WAL data. */
 745 static int      UsableBytesInSegment;
 746
 747 /*
 748  * Private, possibly out-of-date copy of shared LogwrtResult.
 749  * See discussion above.
 750  */
 751 static XLogwrtResult LogwrtResult = {0, 0};
 752
 753 /*
 754  * Codes indicating where we got a WAL file from during recovery, or where
 755  * to attempt to get one.
 756  */
 757 typedef enum
 758 {
 759         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 760         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 761         XLOG_FROM_PG_WAL,                       /* existing file in pg_wal */
 762         XLOG_FROM_STREAM                        /* streamed from master */
 763 } XLogSource;
 764
 765 /* human-readable names for XLogSources, for debugging output */
 766 static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
 767
 768 /*
 769  * openLogFile is -1 or a kernel FD for an open log file segment.
 770  * When it's open, openLogOff is the current seek offset in the file.
 771  * openLogSegNo identifies the segment.  These variables are only
 772  * used to write the XLOG, and so will normally refer to the active segment.
 773  */
 774 static int      openLogFile = -1;
 775 static XLogSegNo openLogSegNo = 0;
 776 static uint32 openLogOff = 0;
 777
 778 /*
 779  * These variables are used similarly to the ones above, but for reading
 780  * the XLOG.  Note, however, that readOff generally represents the offset
 781  * of the page just read, not the seek position of the FD itself, which
 782  * will be just past that page. readLen indicates how much of the current
 783  * page has been read into readBuf, and readSource indicates where we got
 784  * the currently open file from.
 785  */
 786 static int      readFile = -1;
 787 static XLogSegNo readSegNo = 0;
 788 static uint32 readOff = 0;
 789 static uint32 readLen = 0;
 790 static XLogSource readSource = 0;       /* XLOG_FROM_* code */
 791
 792 /*
 793  * Keeps track of which source we're currently reading from. This is
 794  * different from readSource in that this is always set, even when we don't
 795  * currently have a WAL file open. If lastSourceFailed is set, our last
 796  * attempt to read from currentSource failed, and we should try another source
 797  * next.
 798  */
 799 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 800 static bool lastSourceFailed = false;
 801
 802 typedef struct XLogPageReadPrivate
 803 {
 804         int                     emode;
 805         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
 806         bool            randAccess;
 807 } XLogPageReadPrivate;
 808
 809 /*
 810  * These variables track when we last obtained some WAL data to process,
 811  * and where we got it from.  (XLogReceiptSource is initially the same as
 812  * readSource, but readSource gets reset to zero when we don't have data
 813  * to process right now.  It is also different from currentSource, which
 814  * also changes when we try to read from a source and fail, while
 815  * XLogReceiptSource tracks where we last successfully read some WAL.)
 816  */
 817 static TimestampTz XLogReceiptTime = 0;
 818 static XLogSource XLogReceiptSource = 0;        /* XLOG_FROM_* code */
 819
 820 /* State information for XLOG reading */
 821 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 822 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 823
 824 /*
 825  * Local copies of equivalent fields in the control file.  When running
 826  * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
 827  * expect to replay all the WAL available, and updateMinRecoveryPoint is
 828  * switched to false to prevent any updates while replaying records.
 829  * Those values are kept consistent as long as crash recovery runs.
 830  */
 831 static XLogRecPtr minRecoveryPoint;
 832 static TimeLineID minRecoveryPointTLI;
 833 static bool updateMinRecoveryPoint = true;
 834
 835 /*
 836  * Have we reached a consistent database state? In crash recovery, we have
 837  * to replay all the WAL, so reachedConsistency is never set. During archive
 838  * recovery, the database is consistent once minRecoveryPoint is reached.
 839  */
 840 bool            reachedConsistency = false;
 841
 842 static bool InRedo = false;
 843
 844 /* Have we launched bgwriter during recovery? */
 845 static bool bgwriterLaunched = false;
 846
 847 /* For WALInsertLockAcquire/Release functions */
 848 static int      MyLockNo = 0;
 849 static bool holdingAllLocks = false;
 850
 851 #ifdef WAL_DEBUG
 852 static MemoryContext walDebugCxt = NULL;
 853 #endif
 854
 855 static void readRecoveryCommandFile(void);
 856 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
 857 static bool recoveryStopsBefore(XLogReaderState *record);
 858 static bool recoveryStopsAfter(XLogReaderState *record);
 859 static void recoveryPausesHere(void);
 860 static bool recoveryApplyDelay(XLogReaderState *record);
 861 static void SetLatestXTime(TimestampTz xtime);
 862 static void SetCurrentChunkStartTime(TimestampTz xtime);
 863 static void CheckRequiredParameterValues(void);
 864 static void XLogReportParameters(void);
 865 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 866                                         TimeLineID prevTLI);
 867 static void LocalSetXLogInsertAllowed(void);
 868 static void CreateEndOfRecoveryRecord(void);
 869 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 870 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 871 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 872
 873 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 874 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 875 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 876 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 877                                            bool find_free, XLogSegNo max_segno,
 878                                            bool use_lock);
 879 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 880                          int source, bool notfoundOk);
 881 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 882 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 883                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 884                          TimeLineID *readTLI);
 885 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 886                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 887 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 888 static void XLogFileClose(void);
 889 static void PreallocXlogFiles(XLogRecPtr endptr);
 890 static void RemoveTempXlogFiles(void);
 891 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
 892 static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
 893 static void UpdateLastRemovedPtr(char *filename);
 894 static void ValidateXLOGDirectoryStructure(void);
 895 static void CleanupBackupHistory(void);
 896 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 897 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 898                    int emode, bool fetching_ckpt);
 899 static void CheckRecoveryConsistency(void);
 900 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 901                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 902 static bool rescanLatestTimeLine(void);
 903 static void WriteControlFile(void);
 904 static void ReadControlFile(void);
 905 static char *str_time(pg_time_t tnow);
 906 static bool CheckForStandbyTrigger(void);
 907
 908 #ifdef WAL_DEBUG
 909 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
 910 #endif
 911 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
 912 static void pg_start_backup_callback(int code, Datum arg);
 913 static void pg_stop_backup_callback(int code, Datum arg);
 914 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 915                                   bool *backupEndRequired, bool *backupFromStandby);
 916 static bool read_tablespace_map(List **tablespaces);
 917
 918 static void rm_redo_error_callback(void *arg);
 919 static int      get_sync_bit(int method);
 920
 921 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 922                                         XLogRecData *rdata,
 923                                         XLogRecPtr StartPos, XLogRecPtr EndPos);
 924 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 925                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 926 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 927                                   XLogRecPtr *PrevPtr);
 928 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 929 static char *GetXLogBuffer(XLogRecPtr ptr);
 930 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 931 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 932 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 933 static void checkXLogConsistency(XLogReaderState *record);
 934
 935 static void WALInsertLockAcquire(void);
 936 static void WALInsertLockAcquireExclusive(void);
 937 static void WALInsertLockRelease(void);
 938 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 939
 940 /*
 941  * Insert an XLOG record represented by an already-constructed chain of data
 942  * chunks.  This is a low-level routine; to construct the WAL record header
 943  * and data, use the higher-level routines in xloginsert.c.
 944  *
 945  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 946  * WAL record applies to, that were not included in the record as full page
 947  * images.  If fpw_lsn >= RedoRecPtr, the function does not perform the
 948  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 949  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 950  * record is always inserted.
 951  *
 952  * 'flags' gives more in-depth control on the record being inserted. See
 953  * XLogSetRecordFlags() for details.
 954  *
 955  * The first XLogRecData in the chain must be for the record header, and its
 956  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 957  * xl_crc fields in the header, the rest of the header must already be filled
 958  * by the caller.
 959  *
 960  * Returns XLOG pointer to end of record (beginning of next record).
 961  * This can be used as LSN for data pages affected by the logged action.
 962  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 963  * before the data page can be written out.  This implements the basic
 964  * WAL rule "write the log before the data".)
 965  */
 966 XLogRecPtr
 967 XLogInsertRecord(XLogRecData *rdata,
 968                                  XLogRecPtr fpw_lsn,
 969                                  uint8 flags)
 970 {
 971         XLogCtlInsert *Insert = &XLogCtl->Insert;
 972         pg_crc32c       rdata_crc;
 973         bool            inserted;
 974         XLogRecord *rechdr = (XLogRecord *) rdata->data;
 975         uint8           info = rechdr->xl_info & ~XLR_INFO_MASK;
 976         bool            isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
 977                                                            info == XLOG_SWITCH);
 978         XLogRecPtr      StartPos;
 979         XLogRecPtr      EndPos;
 980
 981         /* we assume that all of the record header is in the first chunk */
 982         Assert(rdata->len >= SizeOfXLogRecord);
 983
 984         /* cross-check on whether we should be here or not */
 985         if (!XLogInsertAllowed())
 986                 elog(ERROR, "cannot make new WAL entries during recovery");
 987
 988         /*----------
 989          *
 990          * We have now done all the preparatory work we can without holding a
 991          * lock or modifying shared state. From here on, inserting the new WAL
 992          * record to the shared WAL buffer cache is a two-step process:
 993          *
 994          * 1. Reserve the right amount of space from the WAL. The current head of
 995          *        reserved space is kept in Insert->CurrBytePos, and is protected by
 996          *        insertpos_lck.
 997          *
 998          * 2. Copy the record to the reserved WAL space. This involves finding the
 999          *        correct WAL buffer containing the reserved space, and copying the
1000          *        record in place. This can be done concurrently in multiple processes.
1001          *
1002          * To keep track of which insertions are still in-progress, each concurrent
1003          * inserter acquires an insertion lock. In addition to just indicating that
1004          * an insertion is in progress, the lock tells others how far the inserter
1005          * has progressed. There is a small fixed number of insertion locks,
1006          * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
1007          * boundary, it updates the value stored in the lock to the how far it has
1008          * inserted, to allow the previous buffer to be flushed.
1009          *
1010          * Holding onto an insertion lock also protects RedoRecPtr and
1011          * fullPageWrites from changing until the insertion is finished.
1012          *
1013          * Step 2 can usually be done completely in parallel. If the required WAL
1014          * page is not initialized yet, you have to grab WALBufMappingLock to
1015          * initialize it, but the WAL writer tries to do that ahead of insertions
1016          * to avoid that from happening in the critical path.
1017          *
1018          *----------
1019          */
1020         START_CRIT_SECTION();
1021         if (isLogSwitch)
1022                 WALInsertLockAcquireExclusive();
1023         else
1024                 WALInsertLockAcquire();
1025
1026         /*
1027          * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
1028          * If so, may have to go back and have the caller recompute everything.
1029          * This can only happen just after a checkpoint, so it's better to be slow
1030          * in this case and fast otherwise.
1031          *
1032          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1033          * affect the contents of the XLOG record, so we'll update our local copy
1034          * but not force a recomputation.  (If doPageWrites was just turned off,
1035          * we could recompute the record without full pages, but we choose not to
1036          * bother.)
1037          */
1038         if (RedoRecPtr != Insert->RedoRecPtr)
1039         {
1040                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1041                 RedoRecPtr = Insert->RedoRecPtr;
1042         }
1043         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
1044
1045         if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
1046         {
1047                 /*
1048                  * Oops, some buffer now needs to be backed up that the caller didn't
1049                  * back up.  Start over.
1050                  */
1051                 WALInsertLockRelease();
1052                 END_CRIT_SECTION();
1053                 return InvalidXLogRecPtr;
1054         }
1055
1056         /*
1057          * Reserve space for the record in the WAL. This also sets the xl_prev
1058          * pointer.
1059          */
1060         if (isLogSwitch)
1061                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1062         else
1063         {
1064                 ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
1065                                                                   &rechdr->xl_prev);
1066                 inserted = true;
1067         }
1068
1069         if (inserted)
1070         {
1071                 /*
1072                  * Now that xl_prev has been filled in, calculate CRC of the record
1073                  * header.
1074                  */
1075                 rdata_crc = rechdr->xl_crc;
1076                 COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
1077                 FIN_CRC32C(rdata_crc);
1078                 rechdr->xl_crc = rdata_crc;
1079
1080                 /*
1081                  * All the record data, including the header, is now ready to be
1082                  * inserted. Copy the record in the space reserved.
1083                  */
1084                 CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
1085                                                         StartPos, EndPos);
1086
1087                 /*
1088                  * Unless record is flagged as not important, update LSN of last
1089                  * important record in the current slot. When holding all locks, just
1090                  * update the first one.
1091                  */
1092                 if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
1093                 {
1094                         int                     lockno = holdingAllLocks ? 0 : MyLockNo;
1095
1096                         WALInsertLocks[lockno].l.lastImportantAt = StartPos;
1097                 }
1098         }
1099         else
1100         {
1101                 /*
1102                  * This was an xlog-switch record, but the current insert location was
1103                  * already exactly at the beginning of a segment, so there was no need
1104                  * to do anything.
1105                  */
1106         }
1107
1108         /*
1109          * Done! Let others know that we're finished.
1110          */
1111         WALInsertLockRelease();
1112
1113         MarkCurrentTransactionIdLoggedIfAny();
1114
1115         END_CRIT_SECTION();
1116
1117         /*
1118          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1119          */
1120         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1121         {
1122                 SpinLockAcquire(&XLogCtl->info_lck);
1123                 /* advance global request to include new block(s) */
1124                 if (XLogCtl->LogwrtRqst.Write < EndPos)
1125                         XLogCtl->LogwrtRqst.Write = EndPos;
1126                 /* update local result copy while I have the chance */
1127                 LogwrtResult = XLogCtl->LogwrtResult;
1128                 SpinLockRelease(&XLogCtl->info_lck);
1129         }
1130
1131         /*
1132          * If this was an XLOG_SWITCH record, flush the record and the empty
1133          * padding space that fills the rest of the segment, and perform
1134          * end-of-segment actions (eg, notifying archiver).
1135          */
1136         if (isLogSwitch)
1137         {
1138                 TRACE_POSTGRESQL_WAL_SWITCH();
1139                 XLogFlush(EndPos);
1140
1141                 /*
1142                  * Even though we reserved the rest of the segment for us, which is
1143                  * reflected in EndPos, we return a pointer to just the end of the
1144                  * xlog-switch record.
1145                  */
1146                 if (inserted)
1147                 {
1148                         EndPos = StartPos + SizeOfXLogRecord;
1149                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1150                         {
1151                                 uint64          offset = XLogSegmentOffset(EndPos, wal_segment_size);
1152
1153                                 if (offset == EndPos % XLOG_BLCKSZ)
1154                                         EndPos += SizeOfXLogLongPHD;
1155                                 else
1156                                         EndPos += SizeOfXLogShortPHD;
1157                         }
1158                 }
1159         }
1160
1161 #ifdef WAL_DEBUG
1162         if (XLOG_DEBUG)
1163         {
1164                 static XLogReaderState *debug_reader = NULL;
1165                 StringInfoData buf;
1166                 StringInfoData recordBuf;
1167                 char       *errormsg = NULL;
1168                 MemoryContext oldCxt;
1169
1170                 oldCxt = MemoryContextSwitchTo(walDebugCxt);
1171
1172                 initStringInfo(&buf);
1173                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1174                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1175
1176                 /*
1177                  * We have to piece together the WAL record data from the XLogRecData
1178                  * entries, so that we can pass it to the rm_desc function as one
1179                  * contiguous chunk.
1180                  */
1181                 initStringInfo(&recordBuf);
1182                 for (; rdata != NULL; rdata = rdata->next)
1183                         appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1184
1185                 if (!debug_reader)
1186                         debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);
1187
1188                 if (!debug_reader)
1189                 {
1190                         appendStringInfoString(&buf, "error decoding record: out of memory");
1191                 }
1192                 else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1193                                                                    &errormsg))
1194                 {
1195                         appendStringInfo(&buf, "error decoding record: %s",
1196                                                          errormsg ? errormsg : "no error message");
1197                 }
1198                 else
1199                 {
1200                         appendStringInfoString(&buf, " - ");
1201                         xlog_outdesc(&buf, debug_reader);
1202                 }
1203                 elog(LOG, "%s", buf.data);
1204
1205                 pfree(buf.data);
1206                 pfree(recordBuf.data);
1207                 MemoryContextSwitchTo(oldCxt);
1208         }
1209 #endif
1210
1211         /*
1212          * Update our global variables
1213          */
1214         ProcLastRecPtr = StartPos;
1215         XactLastRecEnd = EndPos;
1216
1217         return EndPos;
1218 }
1219
1220 /*
1221  * Reserves the right amount of space for a record of given size from the WAL.
1222  * *StartPos is set to the beginning of the reserved section, *EndPos to
1223  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1224  * used to set the xl_prev of this record.
1225  *
1226  * This is the performance critical part of XLogInsert that must be serialized
1227  * across backends. The rest can happen mostly in parallel. Try to keep this
1228  * section as short as possible, insertpos_lck can be heavily contended on a
1229  * busy system.
1230  *
1231  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1232  * where we actually copy the record to the reserved space.
1233  */
1234 static void
1235 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1236                                                   XLogRecPtr *PrevPtr)
1237 {
1238         XLogCtlInsert *Insert = &XLogCtl->Insert;
1239         uint64          startbytepos;
1240         uint64          endbytepos;
1241         uint64          prevbytepos;
1242
1243         size = MAXALIGN(size);
1244
1245         /* All (non xlog-switch) records should contain data. */
1246         Assert(size > SizeOfXLogRecord);
1247
1248         /*
1249          * The duration the spinlock needs to be held is minimized by minimizing
1250          * the calculations that have to be done while holding the lock. The
1251          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1252          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1253          * page headers. The mapping between "usable" byte positions and physical
1254          * positions (XLogRecPtrs) can be done outside the locked region, and
1255          * because the usable byte position doesn't include any headers, reserving
1256          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1257          */
1258         SpinLockAcquire(&Insert->insertpos_lck);
1259
1260         startbytepos = Insert->CurrBytePos;
1261         endbytepos = startbytepos + size;
1262         prevbytepos = Insert->PrevBytePos;
1263         Insert->CurrBytePos = endbytepos;
1264         Insert->PrevBytePos = startbytepos;
1265
1266         SpinLockRelease(&Insert->insertpos_lck);
1267
1268         *StartPos = XLogBytePosToRecPtr(startbytepos);
1269         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1270         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1271
1272         /*
1273          * Check that the conversions between "usable byte positions" and
1274          * XLogRecPtrs work consistently in both directions.
1275          */
1276         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1277         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1278         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1279 }
1280
1281 /*
1282  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1283  *
1284  * A log-switch record is handled slightly differently. The rest of the
1285  * segment will be reserved for this insertion, as indicated by the returned
1286  * *EndPos value. However, if we are already at the beginning of the current
1287  * segment, *StartPos and *EndPos are set to the current location without
1288  * reserving any space, and the function returns false.
1289 */
1290 static bool
1291 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1292 {
1293         XLogCtlInsert *Insert = &XLogCtl->Insert;
1294         uint64          startbytepos;
1295         uint64          endbytepos;
1296         uint64          prevbytepos;
1297         uint32          size = MAXALIGN(SizeOfXLogRecord);
1298         XLogRecPtr      ptr;
1299         uint32          segleft;
1300
1301         /*
1302          * These calculations are a bit heavy-weight to be done while holding a
1303          * spinlock, but since we're holding all the WAL insertion locks, there
1304          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1305          * compete for it, but that's not called very frequently.
1306          */
1307         SpinLockAcquire(&Insert->insertpos_lck);
1308
1309         startbytepos = Insert->CurrBytePos;
1310
1311         ptr = XLogBytePosToEndRecPtr(startbytepos);
1312         if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1313         {
1314                 SpinLockRelease(&Insert->insertpos_lck);
1315                 *EndPos = *StartPos = ptr;
1316                 return false;
1317         }
1318
1319         endbytepos = startbytepos + size;
1320         prevbytepos = Insert->PrevBytePos;
1321
1322         *StartPos = XLogBytePosToRecPtr(startbytepos);
1323         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1324
1325         segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1326         if (segleft != wal_segment_size)
1327         {
1328                 /* consume the rest of the segment */
1329                 *EndPos += segleft;
1330                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1331         }
1332         Insert->CurrBytePos = endbytepos;
1333         Insert->PrevBytePos = startbytepos;
1334
1335         SpinLockRelease(&Insert->insertpos_lck);
1336
1337         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1338
1339         Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1340         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1341         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1342         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1343
1344         return true;
1345 }
1346
1347 /*
1348  * Checks whether the current buffer page and backup page stored in the
1349  * WAL record are consistent or not. Before comparing the two pages, a
1350  * masking can be applied to the pages to ignore certain areas like hint bits,
1351  * unused space between pd_lower and pd_upper among other things. This
1352  * function should be called once WAL replay has been completed for a
1353  * given record.
1354  */
1355 static void
1356 checkXLogConsistency(XLogReaderState *record)
1357 {
1358         RmgrId          rmid = XLogRecGetRmid(record);
1359         RelFileNode rnode;
1360         ForkNumber      forknum;
1361         BlockNumber blkno;
1362         int                     block_id;
1363
1364         /* Records with no backup blocks have no need for consistency checks. */
1365         if (!XLogRecHasAnyBlockRefs(record))
1366                 return;
1367
1368         Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1369
1370         for (block_id = 0; block_id <= record->max_block_id; block_id++)
1371         {
1372                 Buffer          buf;
1373                 Page            page;
1374
1375                 if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1376                 {
1377                         /*
1378                          * WAL record doesn't contain a block reference with the given id.
1379                          * Do nothing.
1380                          */
1381                         continue;
1382                 }
1383
1384                 Assert(XLogRecHasBlockImage(record, block_id));
1385
1386                 if (XLogRecBlockImageApply(record, block_id))
1387                 {
1388                         /*
1389                          * WAL record has already applied the page, so bypass the
1390                          * consistency check as that would result in comparing the full
1391                          * page stored in the record with itself.
1392                          */
1393                         continue;
1394                 }
1395
1396                 /*
1397                  * Read the contents from the current buffer and store it in a
1398                  * temporary page.
1399                  */
1400                 buf = XLogReadBufferExtended(rnode, forknum, blkno,
1401                                                                          RBM_NORMAL_NO_LOG);
1402                 if (!BufferIsValid(buf))
1403                         continue;
1404
1405                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1406                 page = BufferGetPage(buf);
1407
1408                 /*
1409                  * Take a copy of the local page where WAL has been applied to have a
1410                  * comparison base before masking it...
1411                  */
1412                 memcpy(replay_image_masked, page, BLCKSZ);
1413
1414                 /* No need for this page anymore now that a copy is in. */
1415                 UnlockReleaseBuffer(buf);
1416
1417                 /*
1418                  * If the block LSN is already ahead of this WAL record, we can't
1419                  * expect contents to match.  This can happen if recovery is
1420                  * restarted.
1421                  */
1422                 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1423                         continue;
1424
1425                 /*
1426                  * Read the contents from the backup copy, stored in WAL record and
1427                  * store it in a temporary page. There is no need to allocate a new
1428                  * page here, a local buffer is fine to hold its contents and a mask
1429                  * can be directly applied on it.
1430                  */
1431                 if (!RestoreBlockImage(record, block_id, master_image_masked))
1432                         elog(ERROR, "failed to restore block image");
1433
1434                 /*
1435                  * If masking function is defined, mask both the master and replay
1436                  * images
1437                  */
1438                 if (RmgrTable[rmid].rm_mask != NULL)
1439                 {
1440                         RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1441                         RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1442                 }
1443
1444                 /* Time to compare the master and replay images. */
1445                 if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
1446                 {
1447                         elog(FATAL,
1448                                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1449                                  rnode.spcNode, rnode.dbNode, rnode.relNode,
1450                                  forknum, blkno);
1451                 }
1452         }
1453 }
1454
1455 /*
1456  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1457  * area in the WAL.
1458  */
1459 static void
1460 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1461                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1462 {
1463         char       *currpos;
1464         int                     freespace;
1465         int                     written;
1466         XLogRecPtr      CurrPos;
1467         XLogPageHeader pagehdr;
1468
1469         /*
1470          * Get a pointer to the right place in the right WAL buffer to start
1471          * inserting to.
1472          */
1473         CurrPos = StartPos;
1474         currpos = GetXLogBuffer(CurrPos);
1475         freespace = INSERT_FREESPACE(CurrPos);
1476
1477         /*
1478          * there should be enough space for at least the first field (xl_tot_len)
1479          * on this page.
1480          */
1481         Assert(freespace >= sizeof(uint32));
1482
1483         /* Copy record data */
1484         written = 0;
1485         while (rdata != NULL)
1486         {
1487                 char       *rdata_data = rdata->data;
1488                 int                     rdata_len = rdata->len;
1489
1490                 while (rdata_len > freespace)
1491                 {
1492                         /*
1493                          * Write what fits on this page, and continue on the next page.
1494                          */
1495                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1496                         memcpy(currpos, rdata_data, freespace);
1497                         rdata_data += freespace;
1498                         rdata_len -= freespace;
1499                         written += freespace;
1500                         CurrPos += freespace;
1501
1502                         /*
1503                          * Get pointer to beginning of next page, and set the xlp_rem_len
1504                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1505                          *
1506                          * It's safe to set the contrecord flag and xlp_rem_len without a
1507                          * lock on the page. All the other flags were already set when the
1508                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1509                          * only backend that needs to set the contrecord flag.
1510                          */
1511                         currpos = GetXLogBuffer(CurrPos);
1512                         pagehdr = (XLogPageHeader) currpos;
1513                         pagehdr->xlp_rem_len = write_len - written;
1514                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1515
1516                         /* skip over the page header */
1517                         if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1518                         {
1519                                 CurrPos += SizeOfXLogLongPHD;
1520                                 currpos += SizeOfXLogLongPHD;
1521                         }
1522                         else
1523                         {
1524                                 CurrPos += SizeOfXLogShortPHD;
1525                                 currpos += SizeOfXLogShortPHD;
1526                         }
1527                         freespace = INSERT_FREESPACE(CurrPos);
1528                 }
1529
1530                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1531                 memcpy(currpos, rdata_data, rdata_len);
1532                 currpos += rdata_len;
1533                 CurrPos += rdata_len;
1534                 freespace -= rdata_len;
1535                 written += rdata_len;
1536
1537                 rdata = rdata->next;
1538         }
1539         Assert(written == write_len);
1540
1541         /*
1542          * If this was an xlog-switch, it's not enough to write the switch record,
1543          * we also have to consume all the remaining space in the WAL segment.  We
1544          * have already reserved that space, but we need to actually fill it.
1545          */
1546         if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1547         {
1548                 /* An xlog-switch record doesn't contain any data besides the header */
1549                 Assert(write_len == SizeOfXLogRecord);
1550
1551                 /* Assert that we did reserve the right amount of space */
1552                 Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1553
1554                 /* Use up all the remaining space on the current page */
1555                 CurrPos += freespace;
1556
1557                 /*
1558                  * Cause all remaining pages in the segment to be flushed, leaving the
1559                  * XLog position where it should be, at the start of the next segment.
1560                  * We do this one page at a time, to make sure we don't deadlock
1561                  * against ourselves if wal_buffers < wal_segment_size.
1562                  */
1563                 while (CurrPos < EndPos)
1564                 {
1565                         /*
1566                          * The minimal action to flush the page would be to call
1567                          * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1568                          * AdvanceXLInsertBuffer(...).  The page would be left initialized
1569                          * mostly to zeros, except for the page header (always the short
1570                          * variant, as this is never a segment's first page).
1571                          *
1572                          * The large vistas of zeros are good for compressibility, but the
1573                          * headers interrupting them every XLOG_BLCKSZ (with values that
1574                          * differ from page to page) are not.  The effect varies with
1575                          * compression tool, but bzip2 for instance compresses about an
1576                          * order of magnitude worse if those headers are left in place.
1577                          *
1578                          * Rather than complicating AdvanceXLInsertBuffer itself (which is
1579                          * called in heavily-loaded circumstances as well as this lightly-
1580                          * loaded one) with variant behavior, we just use GetXLogBuffer
1581                          * (which itself calls the two methods we need) to get the pointer
1582                          * and zero most of the page.  Then we just zero the page header.
1583                          */
1584                         currpos = GetXLogBuffer(CurrPos);
1585                         MemSet(currpos, 0, SizeOfXLogShortPHD);
1586
1587                         CurrPos += XLOG_BLCKSZ;
1588                 }
1589         }
1590         else
1591         {
1592                 /* Align the end position, so that the next record starts aligned */
1593                 CurrPos = MAXALIGN64(CurrPos);
1594         }
1595
1596         if (CurrPos != EndPos)
1597                 elog(PANIC, "space reserved for WAL record does not match what was written");
1598 }
1599
1600 /*
1601  * Acquire a WAL insertion lock, for inserting to WAL.
1602  */
1603 static void
1604 WALInsertLockAcquire(void)
1605 {
1606         bool            immed;
1607
1608         /*
1609          * It doesn't matter which of the WAL insertion locks we acquire, so try
1610          * the one we used last time.  If the system isn't particularly busy, it's
1611          * a good bet that it's still available, and it's good to have some
1612          * affinity to a particular lock so that you don't unnecessarily bounce
1613          * cache lines between processes when there's no contention.
1614          *
1615          * If this is the first time through in this backend, pick a lock
1616          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1617          * lot of very short connections.
1618          */
1619         static int      lockToTry = -1;
1620
1621         if (lockToTry == -1)
1622                 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1623         MyLockNo = lockToTry;
1624
1625         /*
1626          * The insertingAt value is initially set to 0, as we don't know our
1627          * insert location yet.
1628          */
1629         immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1630         if (!immed)
1631         {
1632                 /*
1633                  * If we couldn't get the lock immediately, try another lock next
1634                  * time.  On a system with more insertion locks than concurrent
1635                  * inserters, this causes all the inserters to eventually migrate to a
1636                  * lock that no-one else is using.  On a system with more inserters
1637                  * than locks, it still helps to distribute the inserters evenly
1638                  * across the locks.
1639                  */
1640                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1641         }
1642 }
1643
1644 /*
1645  * Acquire all WAL insertion locks, to prevent other backends from inserting
1646  * to WAL.
1647  */
1648 static void
1649 WALInsertLockAcquireExclusive(void)
1650 {
1651         int                     i;
1652
1653         /*
1654          * When holding all the locks, all but the last lock's insertingAt
1655          * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1656          * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1657          */
1658         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1659         {
1660                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1661                 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1662                                                 &WALInsertLocks[i].l.insertingAt,
1663                                                 PG_UINT64_MAX);
1664         }
1665         /* Variable value reset to 0 at release */
1666         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1667
1668         holdingAllLocks = true;
1669 }
1670
1671 /*
1672  * Release our insertion lock (or locks, if we're holding them all).
1673  *
1674  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1675  * next time the lock is acquired.
1676  */
1677 static void
1678 WALInsertLockRelease(void)
1679 {
1680         if (holdingAllLocks)
1681         {
1682                 int                     i;
1683
1684                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1685                         LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1686                                                                   &WALInsertLocks[i].l.insertingAt,
1687                                                                   0);
1688
1689                 holdingAllLocks = false;
1690         }
1691         else
1692         {
1693                 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1694                                                           &WALInsertLocks[MyLockNo].l.insertingAt,
1695                                                           0);
1696         }
1697 }
1698
1699 /*
1700  * Update our insertingAt value, to let others know that we've finished
1701  * inserting up to that point.
1702  */
1703 static void
1704 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1705 {
1706         if (holdingAllLocks)
1707         {
1708                 /*
1709                  * We use the last lock to mark our actual position, see comments in
1710                  * WALInsertLockAcquireExclusive.
1711                  */
1712                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1713                                                 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1714                                                 insertingAt);
1715         }
1716         else
1717                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1718                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1719                                                 insertingAt);
1720 }
1721
1722 /*
1723  * Wait for any WAL insertions < upto to finish.
1724  *
1725  * Returns the location of the oldest insertion that is still in-progress.
1726  * Any WAL prior to that point has been fully copied into WAL buffers, and
1727  * can be flushed out to disk. Because this waits for any insertions older
1728  * than 'upto' to finish, the return value is always >= 'upto'.
1729  *
1730  * Note: When you are about to write out WAL, you must call this function
1731  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1732  * need to wait for an insertion to finish (or at least advance to next
1733  * uninitialized page), and the inserter might need to evict an old WAL buffer
1734  * to make room for a new one, which in turn requires WALWriteLock.
1735  */
1736 static XLogRecPtr
1737 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1738 {
1739         uint64          bytepos;
1740         XLogRecPtr      reservedUpto;
1741         XLogRecPtr      finishedUpto;
1742         XLogCtlInsert *Insert = &XLogCtl->Insert;
1743         int                     i;
1744
1745         if (MyProc == NULL)
1746                 elog(PANIC, "cannot wait without a PGPROC structure");
1747
1748         /* Read the current insert position */
1749         SpinLockAcquire(&Insert->insertpos_lck);
1750         bytepos = Insert->CurrBytePos;
1751         SpinLockRelease(&Insert->insertpos_lck);
1752         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1753
1754         /*
1755          * No-one should request to flush a piece of WAL that hasn't even been
1756          * reserved yet. However, it can happen if there is a block with a bogus
1757          * LSN on disk, for example. XLogFlush checks for that situation and
1758          * complains, but only after the flush. Here we just assume that to mean
1759          * that all WAL that has been reserved needs to be finished. In this
1760          * corner-case, the return value can be smaller than 'upto' argument.
1761          */
1762         if (upto > reservedUpto)
1763         {
1764                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1765                          (uint32) (upto >> 32), (uint32) upto,
1766                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1767                 upto = reservedUpto;
1768         }
1769
1770         /*
1771          * Loop through all the locks, sleeping on any in-progress insert older
1772          * than 'upto'.
1773          *
1774          * finishedUpto is our return value, indicating the point upto which all
1775          * the WAL insertions have been finished. Initialize it to the head of
1776          * reserved WAL, and as we iterate through the insertion locks, back it
1777          * out for any insertion that's still in progress.
1778          */
1779         finishedUpto = reservedUpto;
1780         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1781         {
1782                 XLogRecPtr      insertingat = InvalidXLogRecPtr;
1783
1784                 do
1785                 {
1786                         /*
1787                          * See if this insertion is in progress. LWLockWait will wait for
1788                          * the lock to be released, or for the 'value' to be set by a
1789                          * LWLockUpdateVar call.  When a lock is initially acquired, its
1790                          * value is 0 (InvalidXLogRecPtr), which means that we don't know
1791                          * where it's inserting yet.  We will have to wait for it.  If
1792                          * it's a small insertion, the record will most likely fit on the
1793                          * same page and the inserter will release the lock without ever
1794                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1795                          * advertise the insertion point with LWLockUpdateVar before
1796                          * sleeping.
1797                          */
1798                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1799                                                                  &WALInsertLocks[i].l.insertingAt,
1800                                                                  insertingat, &insertingat))
1801                         {
1802                                 /* the lock was free, so no insertion in progress */
1803                                 insertingat = InvalidXLogRecPtr;
1804                                 break;
1805                         }
1806
1807                         /*
1808                          * This insertion is still in progress. Have to wait, unless the
1809                          * inserter has proceeded past 'upto'.
1810                          */
1811                 } while (insertingat < upto);
1812
1813                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1814                         finishedUpto = insertingat;
1815         }
1816         return finishedUpto;
1817 }
1818
1819 /*
1820  * Get a pointer to the right location in the WAL buffer containing the
1821  * given XLogRecPtr.
1822  *
1823  * If the page is not initialized yet, it is initialized. That might require
1824  * evicting an old dirty buffer from the buffer cache, which means I/O.
1825  *
1826  * The caller must ensure that the page containing the requested location
1827  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1828  * hold onto a WAL insertion lock with the insertingAt position set to
1829  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1830  * to evict an old page from the buffer. (This means that once you call
1831  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1832  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1833  * later, because older buffers might be recycled already)
1834  */
1835 static char *
1836 GetXLogBuffer(XLogRecPtr ptr)
1837 {
1838         int                     idx;
1839         XLogRecPtr      endptr;
1840         static uint64 cachedPage = 0;
1841         static char *cachedPos = NULL;
1842         XLogRecPtr      expectedEndPtr;
1843
1844         /*
1845          * Fast path for the common case that we need to access again the same
1846          * page as last time.
1847          */
1848         if (ptr / XLOG_BLCKSZ == cachedPage)
1849         {
1850                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1851                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1852                 return cachedPos + ptr % XLOG_BLCKSZ;
1853         }
1854
1855         /*
1856          * The XLog buffer cache is organized so that a page is always loaded to a
1857          * particular buffer.  That way we can easily calculate the buffer a given
1858          * page must be loaded into, from the XLogRecPtr alone.
1859          */
1860         idx = XLogRecPtrToBufIdx(ptr);
1861
1862         /*
1863          * See what page is loaded in the buffer at the moment. It could be the
1864          * page we're looking for, or something older. It can't be anything newer
1865          * - that would imply the page we're looking for has already been written
1866          * out to disk and evicted, and the caller is responsible for making sure
1867          * that doesn't happen.
1868          *
1869          * However, we don't hold a lock while we read the value. If someone has
1870          * just initialized the page, it's possible that we get a "torn read" of
1871          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1872          * that case we will see a bogus value. That's ok, we'll grab the mapping
1873          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1874          * the page we're looking for. But it means that when we do this unlocked
1875          * read, we might see a value that appears to be ahead of the page we're
1876          * looking for. Don't PANIC on that, until we've verified the value while
1877          * holding the lock.
1878          */
1879         expectedEndPtr = ptr;
1880         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1881
1882         endptr = XLogCtl->xlblocks[idx];
1883         if (expectedEndPtr != endptr)
1884         {
1885                 XLogRecPtr      initializedUpto;
1886
1887                 /*
1888                  * Before calling AdvanceXLInsertBuffer(), which can block, let others
1889                  * know how far we're finished with inserting the record.
1890                  *
1891                  * NB: If 'ptr' points to just after the page header, advertise a
1892                  * position at the beginning of the page rather than 'ptr' itself. If
1893                  * there are no other insertions running, someone might try to flush
1894                  * up to our advertised location. If we advertised a position after
1895                  * the page header, someone might try to flush the page header, even
1896                  * though page might actually not be initialized yet. As the first
1897                  * inserter on the page, we are effectively responsible for making
1898                  * sure that it's initialized, before we let insertingAt to move past
1899                  * the page header.
1900                  */
1901                 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1902                         XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1903                         initializedUpto = ptr - SizeOfXLogShortPHD;
1904                 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1905                                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1906                         initializedUpto = ptr - SizeOfXLogLongPHD;
1907                 else
1908                         initializedUpto = ptr;
1909
1910                 WALInsertLockUpdateInsertingAt(initializedUpto);
1911
1912                 AdvanceXLInsertBuffer(ptr, false);
1913                 endptr = XLogCtl->xlblocks[idx];
1914
1915                 if (expectedEndPtr != endptr)
1916                         elog(PANIC, "could not find WAL buffer for %X/%X",
1917                                  (uint32) (ptr >> 32), (uint32) ptr);
1918         }
1919         else
1920         {
1921                 /*
1922                  * Make sure the initialization of the page is visible to us, and
1923                  * won't arrive later to overwrite the WAL data we write on the page.
1924                  */
1925                 pg_memory_barrier();
1926         }
1927
1928         /*
1929          * Found the buffer holding this page. Return a pointer to the right
1930          * offset within the page.
1931          */
1932         cachedPage = ptr / XLOG_BLCKSZ;
1933         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1934
1935         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1936         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1937
1938         return cachedPos + ptr % XLOG_BLCKSZ;
1939 }
1940
1941 /*
1942  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1943  * is the position starting from the beginning of WAL, excluding all WAL
1944  * page headers.
1945  */
1946 static XLogRecPtr
1947 XLogBytePosToRecPtr(uint64 bytepos)
1948 {
1949         uint64          fullsegs;
1950         uint64          fullpages;
1951         uint64          bytesleft;
1952         uint32          seg_offset;
1953         XLogRecPtr      result;
1954
1955         fullsegs = bytepos / UsableBytesInSegment;
1956         bytesleft = bytepos % UsableBytesInSegment;
1957
1958         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1959         {
1960                 /* fits on first page of segment */
1961                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1962         }
1963         else
1964         {
1965                 /* account for the first page on segment with long header */
1966                 seg_offset = XLOG_BLCKSZ;
1967                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1968
1969                 fullpages = bytesleft / UsableBytesInPage;
1970                 bytesleft = bytesleft % UsableBytesInPage;
1971
1972                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1973         }
1974
1975         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1976
1977         return result;
1978 }
1979
1980 /*
1981  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1982  * returns a pointer to the beginning of the page (ie. before page header),
1983  * not to where the first xlog record on that page would go to. This is used
1984  * when converting a pointer to the end of a record.
1985  */
1986 static XLogRecPtr
1987 XLogBytePosToEndRecPtr(uint64 bytepos)
1988 {
1989         uint64          fullsegs;
1990         uint64          fullpages;
1991         uint64          bytesleft;
1992         uint32          seg_offset;
1993         XLogRecPtr      result;
1994
1995         fullsegs = bytepos / UsableBytesInSegment;
1996         bytesleft = bytepos % UsableBytesInSegment;
1997
1998         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1999         {
2000                 /* fits on first page of segment */
2001                 if (bytesleft == 0)
2002                         seg_offset = 0;
2003                 else
2004                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2005         }
2006         else
2007         {
2008                 /* account for the first page on segment with long header */
2009                 seg_offset = XLOG_BLCKSZ;
2010                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2011
2012                 fullpages = bytesleft / UsableBytesInPage;
2013                 bytesleft = bytesleft % UsableBytesInPage;
2014
2015                 if (bytesleft == 0)
2016                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2017                 else
2018                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2019         }
2020
2021         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2022
2023         return result;
2024 }
2025
2026 /*
2027  * Convert an XLogRecPtr to a "usable byte position".
2028  */
2029 static uint64
2030 XLogRecPtrToBytePos(XLogRecPtr ptr)
2031 {
2032         uint64          fullsegs;
2033         uint32          fullpages;
2034         uint32          offset;
2035         uint64          result;
2036
2037         XLByteToSeg(ptr, fullsegs, wal_segment_size);
2038
2039         fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2040         offset = ptr % XLOG_BLCKSZ;
2041
2042         if (fullpages == 0)
2043         {
2044                 result = fullsegs * UsableBytesInSegment;
2045                 if (offset > 0)
2046                 {
2047                         Assert(offset >= SizeOfXLogLongPHD);
2048                         result += offset - SizeOfXLogLongPHD;
2049                 }
2050         }
2051         else
2052         {
2053                 result = fullsegs * UsableBytesInSegment +
2054                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2055                         (fullpages - 1) * UsableBytesInPage;    /* full pages */
2056                 if (offset > 0)
2057                 {
2058                         Assert(offset >= SizeOfXLogShortPHD);
2059                         result += offset - SizeOfXLogShortPHD;
2060                 }
2061         }
2062
2063         return result;
2064 }
2065
2066 /*
2067  * Initialize XLOG buffers, writing out old buffers if they still contain
2068  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2069  * true, initialize as many pages as we can without having to write out
2070  * unwritten data. Any new pages are initialized to zeros, with pages headers
2071  * initialized properly.
2072  */
2073 static void
2074 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2075 {
2076         XLogCtlInsert *Insert = &XLogCtl->Insert;
2077         int                     nextidx;
2078         XLogRecPtr      OldPageRqstPtr;
2079         XLogwrtRqst WriteRqst;
2080         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2081         XLogRecPtr      NewPageBeginPtr;
2082         XLogPageHeader NewPage;
2083         int                     npages = 0;
2084
2085         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2086
2087         /*
2088          * Now that we have the lock, check if someone initialized the page
2089          * already.
2090          */
2091         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2092         {
2093                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2094
2095                 /*
2096                  * Get ending-offset of the buffer page we need to replace (this may
2097                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2098                  * already written out.
2099                  */
2100                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2101                 if (LogwrtResult.Write < OldPageRqstPtr)
2102                 {
2103                         /*
2104                          * Nope, got work to do. If we just want to pre-initialize as much
2105                          * as we can without flushing, give up now.
2106                          */
2107                         if (opportunistic)
2108                                 break;
2109
2110                         /* Before waiting, get info_lck and update LogwrtResult */
2111                         SpinLockAcquire(&XLogCtl->info_lck);
2112                         if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2113                                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2114                         LogwrtResult = XLogCtl->LogwrtResult;
2115                         SpinLockRelease(&XLogCtl->info_lck);
2116
2117                         /*
2118                          * Now that we have an up-to-date LogwrtResult value, see if we
2119                          * still need to write it or if someone else already did.
2120                          */
2121                         if (LogwrtResult.Write < OldPageRqstPtr)
2122                         {
2123                                 /*
2124                                  * Must acquire write lock. Release WALBufMappingLock first,
2125                                  * to make sure that all insertions that we need to wait for
2126                                  * can finish (up to this same position). Otherwise we risk
2127                                  * deadlock.
2128                                  */
2129                                 LWLockRelease(WALBufMappingLock);
2130
2131                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2132
2133                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2134
2135                                 LogwrtResult = XLogCtl->LogwrtResult;
2136                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2137                                 {
2138                                         /* OK, someone wrote it already */
2139                                         LWLockRelease(WALWriteLock);
2140                                 }
2141                                 else
2142                                 {
2143                                         /* Have to write it ourselves */
2144                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2145                                         WriteRqst.Write = OldPageRqstPtr;
2146                                         WriteRqst.Flush = 0;
2147                                         XLogWrite(WriteRqst, false);
2148                                         LWLockRelease(WALWriteLock);
2149                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2150                                 }
2151                                 /* Re-acquire WALBufMappingLock and retry */
2152                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2153                                 continue;
2154                         }
2155                 }
2156
2157                 /*
2158                  * Now the next buffer slot is free and we can set it up to be the
2159                  * next output page.
2160                  */
2161                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2162                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2163
2164                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2165
2166                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2167
2168                 /*
2169                  * Be sure to re-zero the buffer so that bytes beyond what we've
2170                  * written will look like zeroes and not valid XLOG records...
2171                  */
2172                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2173
2174                 /*
2175                  * Fill the new page's header
2176                  */
2177                 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2178
2179                 /* NewPage->xlp_info = 0; */    /* done by memset */
2180                 NewPage->xlp_tli = ThisTimeLineID;
2181                 NewPage->xlp_pageaddr = NewPageBeginPtr;
2182
2183                 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2184
2185                 /*
2186                  * If online backup is not in progress, mark the header to indicate
2187                  * that WAL records beginning in this page have removable backup
2188                  * blocks.  This allows the WAL archiver to know whether it is safe to
2189                  * compress archived WAL data by transforming full-block records into
2190                  * the non-full-block format.  It is sufficient to record this at the
2191                  * page level because we force a page switch (in fact a segment
2192                  * switch) when starting a backup, so the flag will be off before any
2193                  * records can be written during the backup.  At the end of a backup,
2194                  * the last page will be marked as all unsafe when perhaps only part
2195                  * is unsafe, but at worst the archiver would miss the opportunity to
2196                  * compress a few records.
2197                  */
2198                 if (!Insert->forcePageWrites)
2199                         NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2200
2201                 /*
2202                  * If first page of an XLOG segment file, make it a long header.
2203                  */
2204                 if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2205                 {
2206                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2207
2208                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2209                         NewLongPage->xlp_seg_size = wal_segment_size;
2210                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2211                         NewPage->xlp_info |= XLP_LONG_HEADER;
2212                 }
2213
2214                 /*
2215                  * Make sure the initialization of the page becomes visible to others
2216                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2217                  * holding a lock.
2218                  */
2219                 pg_write_barrier();
2220
2221                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2222
2223                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2224
2225                 npages++;
2226         }
2227         LWLockRelease(WALBufMappingLock);
2228
2229 #ifdef WAL_DEBUG
2230         if (XLOG_DEBUG && npages > 0)
2231         {
2232                 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2233                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2234         }
2235 #endif
2236 }
2237
2238 /*
2239  * Calculate CheckPointSegments based on max_wal_size_mb and
2240  * checkpoint_completion_target.
2241  */
2242 static void
2243 CalculateCheckpointSegments(void)
2244 {
2245         double          target;
2246
2247         /*-------
2248          * Calculate the distance at which to trigger a checkpoint, to avoid
2249          * exceeding max_wal_size_mb. This is based on two assumptions:
2250          *
2251          * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2252          *    WAL for two checkpoint cycles to allow us to recover from the
2253          *    secondary checkpoint if the first checkpoint failed, though we
2254          *    only did this on the master anyway, not on standby. Keeping just
2255          *    one checkpoint simplifies processing and reduces disk space in
2256          *    many smaller databases.)
2257          * b) during checkpoint, we consume checkpoint_completion_target *
2258          *        number of segments consumed between checkpoints.
2259          *-------
2260          */
2261         target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2262                 (1.0 + CheckPointCompletionTarget);
2263
2264         /* round down */
2265         CheckPointSegments = (int) target;
2266
2267         if (CheckPointSegments < 1)
2268                 CheckPointSegments = 1;
2269 }
2270
2271 void
2272 assign_max_wal_size(int newval, void *extra)
2273 {
2274         max_wal_size_mb = newval;
2275         CalculateCheckpointSegments();
2276 }
2277
2278 void
2279 assign_checkpoint_completion_target(double newval, void *extra)
2280 {
2281         CheckPointCompletionTarget = newval;
2282         CalculateCheckpointSegments();
2283 }
2284
2285 /*
2286  * At a checkpoint, how many WAL segments to recycle as preallocated future
2287  * XLOG segments? Returns the highest segment that should be preallocated.
2288  */
2289 static XLogSegNo
2290 XLOGfileslop(XLogRecPtr PriorRedoPtr)
2291 {
2292         XLogSegNo       minSegNo;
2293         XLogSegNo       maxSegNo;
2294         double          distance;
2295         XLogSegNo       recycleSegNo;
2296
2297         /*
2298          * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2299          * correspond to. Always recycle enough segments to meet the minimum, and
2300          * remove enough segments to stay below the maximum.
2301          */
2302         minSegNo = PriorRedoPtr / wal_segment_size +
2303                 ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2304         maxSegNo = PriorRedoPtr / wal_segment_size +
2305                 ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2306
2307         /*
2308          * Between those limits, recycle enough segments to get us through to the
2309          * estimated end of next checkpoint.
2310          *
2311          * To estimate where the next checkpoint will finish, assume that the
2312          * system runs steadily consuming CheckPointDistanceEstimate bytes between
2313          * every checkpoint.
2314          */
2315         distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2316         /* add 10% for good measure. */
2317         distance *= 1.10;
2318
2319         recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) /
2320                                                                         wal_segment_size);
2321
2322         if (recycleSegNo < minSegNo)
2323                 recycleSegNo = minSegNo;
2324         if (recycleSegNo > maxSegNo)
2325                 recycleSegNo = maxSegNo;
2326
2327         return recycleSegNo;
2328 }
2329
2330 /*
2331  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2332  *
2333  * new_segno indicates a log file that has just been filled up (or read
2334  * during recovery). We measure the distance from RedoRecPtr to new_segno
2335  * and see if that exceeds CheckPointSegments.
2336  *
2337  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2338  */
2339 static bool
2340 XLogCheckpointNeeded(XLogSegNo new_segno)
2341 {
2342         XLogSegNo       old_segno;
2343
2344         XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2345
2346         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2347                 return true;
2348         return false;
2349 }
2350
2351 /*
2352  * Write and/or fsync the log at least as far as WriteRqst indicates.
2353  *
2354  * If flexible == true, we don't have to write as far as WriteRqst, but
2355  * may stop at any convenient boundary (such as a cache or logfile boundary).
2356  * This option allows us to avoid uselessly issuing multiple writes when a
2357  * single one would do.
2358  *
2359  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2360  * must be called before grabbing the lock, to make sure the data is ready to
2361  * write.
2362  */
2363 static void
2364 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2365 {
2366         bool            ispartialpage;
2367         bool            last_iteration;
2368         bool            finishing_seg;
2369         bool            use_existent;
2370         int                     curridx;
2371         int                     npages;
2372         int                     startidx;
2373         uint32          startoffset;
2374
2375         /* We should always be inside a critical section here */
2376         Assert(CritSectionCount > 0);
2377
2378         /*
2379          * Update local LogwrtResult (caller probably did this already, but...)
2380          */
2381         LogwrtResult = XLogCtl->LogwrtResult;
2382
2383         /*
2384          * Since successive pages in the xlog cache are consecutively allocated,
2385          * we can usually gather multiple pages together and issue just one
2386          * write() call.  npages is the number of pages we have determined can be
2387          * written together; startidx is the cache block index of the first one,
2388          * and startoffset is the file offset at which it should go. The latter
2389          * two variables are only valid when npages > 0, but we must initialize
2390          * all of them to keep the compiler quiet.
2391          */
2392         npages = 0;
2393         startidx = 0;
2394         startoffset = 0;
2395
2396         /*
2397          * Within the loop, curridx is the cache block index of the page to
2398          * consider writing.  Begin at the buffer containing the next unwritten
2399          * page, or last partially written page.
2400          */
2401         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2402
2403         while (LogwrtResult.Write < WriteRqst.Write)
2404         {
2405                 /*
2406                  * Make sure we're not ahead of the insert process.  This could happen
2407                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2408                  * last page that's been initialized by AdvanceXLInsertBuffer.
2409                  */
2410                 XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
2411
2412                 if (LogwrtResult.Write >= EndPtr)
2413                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2414                                  (uint32) (LogwrtResult.Write >> 32),
2415                                  (uint32) LogwrtResult.Write,
2416                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2417
2418                 /* Advance LogwrtResult.Write to end of current buffer page */
2419                 LogwrtResult.Write = EndPtr;
2420                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2421
2422                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2423                                                          wal_segment_size))
2424                 {
2425                         /*
2426                          * Switch to new logfile segment.  We cannot have any pending
2427                          * pages here (since we dump what we have at segment end).
2428                          */
2429                         Assert(npages == 0);
2430                         if (openLogFile >= 0)
2431                                 XLogFileClose();
2432                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2433                                                         wal_segment_size);
2434
2435                         /* create/use new log file */
2436                         use_existent = true;
2437                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2438                         openLogOff = 0;
2439                 }
2440
2441                 /* Make sure we have the current logfile open */
2442                 if (openLogFile < 0)
2443                 {
2444                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2445                                                         wal_segment_size);
2446                         openLogFile = XLogFileOpen(openLogSegNo);
2447                         openLogOff = 0;
2448                 }
2449
2450                 /* Add current page to the set of pending pages-to-dump */
2451                 if (npages == 0)
2452                 {
2453                         /* first of group */
2454                         startidx = curridx;
2455                         startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2456                                                                                         wal_segment_size);
2457                 }
2458                 npages++;
2459
2460                 /*
2461                  * Dump the set if this will be the last loop iteration, or if we are
2462                  * at the last page of the cache area (since the next page won't be
2463                  * contiguous in memory), or if we are at the end of the logfile
2464                  * segment.
2465                  */
2466                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2467
2468                 finishing_seg = !ispartialpage &&
2469                         (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2470
2471                 if (last_iteration ||
2472                         curridx == XLogCtl->XLogCacheBlck ||
2473                         finishing_seg)
2474                 {
2475                         char       *from;
2476                         Size            nbytes;
2477                         Size            nleft;
2478                         int                     written;
2479
2480                         /* Need to seek in the file? */
2481                         if (openLogOff != startoffset)
2482                         {
2483                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2484                                         ereport(PANIC,
2485                                                         (errcode_for_file_access(),
2486                                                          errmsg("could not seek in log file %s to offset %u: %m",
2487                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2488                                                                         startoffset)));
2489                                 openLogOff = startoffset;
2490                         }
2491
2492                         /* OK to write the page(s) */
2493                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2494                         nbytes = npages * (Size) XLOG_BLCKSZ;
2495                         nleft = nbytes;
2496                         do
2497                         {
2498                                 errno = 0;
2499                                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2500                                 written = write(openLogFile, from, nleft);
2501                                 pgstat_report_wait_end();
2502                                 if (written <= 0)
2503                                 {
2504                                         if (errno == EINTR)
2505                                                 continue;
2506                                         ereport(PANIC,
2507                                                         (errcode_for_file_access(),
2508                                                          errmsg("could not write to log file %s "
2509                                                                         "at offset %u, length %zu: %m",
2510                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2511                                                                         openLogOff, nbytes)));
2512                                 }
2513                                 nleft -= written;
2514                                 from += written;
2515                         } while (nleft > 0);
2516
2517                         /* Update state for write */
2518                         openLogOff += nbytes;
2519                         npages = 0;
2520
2521                         /*
2522                          * If we just wrote the whole last page of a logfile segment,
2523                          * fsync the segment immediately.  This avoids having to go back
2524                          * and re-open prior segments when an fsync request comes along
2525                          * later. Doing it here ensures that one and only one backend will
2526                          * perform this fsync.
2527                          *
2528                          * This is also the right place to notify the Archiver that the
2529                          * segment is ready to copy to archival storage, and to update the
2530                          * timer for archive_timeout, and to signal for a checkpoint if
2531                          * too many logfile segments have been used since the last
2532                          * checkpoint.
2533                          */
2534                         if (finishing_seg)
2535                         {
2536                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2537
2538                                 /* signal that we need to wakeup walsenders later */
2539                                 WalSndWakeupRequest();
2540
2541                                 LogwrtResult.Flush = LogwrtResult.Write;        /* end of page */
2542
2543                                 if (XLogArchivingActive())
2544                                         XLogArchiveNotifySeg(openLogSegNo);
2545
2546                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2547                                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2548
2549                                 /*
2550                                  * Request a checkpoint if we've consumed too much xlog since
2551                                  * the last one.  For speed, we first check using the local
2552                                  * copy of RedoRecPtr, which might be out of date; if it looks
2553                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2554                                  * recheck.
2555                                  */
2556                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2557                                 {
2558                                         (void) GetRedoRecPtr();
2559                                         if (XLogCheckpointNeeded(openLogSegNo))
2560                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2561                                 }
2562                         }
2563                 }
2564
2565                 if (ispartialpage)
2566                 {
2567                         /* Only asked to write a partial page */
2568                         LogwrtResult.Write = WriteRqst.Write;
2569                         break;
2570                 }
2571                 curridx = NextBufIdx(curridx);
2572
2573                 /* If flexible, break out of loop as soon as we wrote something */
2574                 if (flexible && npages == 0)
2575                         break;
2576         }
2577
2578         Assert(npages == 0);
2579
2580         /*
2581          * If asked to flush, do so
2582          */
2583         if (LogwrtResult.Flush < WriteRqst.Flush &&
2584                 LogwrtResult.Flush < LogwrtResult.Write)
2585
2586         {
2587                 /*
2588                  * Could get here without iterating above loop, in which case we might
2589                  * have no open file or the wrong one.  However, we do not need to
2590                  * fsync more than one file.
2591                  */
2592                 if (sync_method != SYNC_METHOD_OPEN &&
2593                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2594                 {
2595                         if (openLogFile >= 0 &&
2596                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2597                                                                  wal_segment_size))
2598                                 XLogFileClose();
2599                         if (openLogFile < 0)
2600                         {
2601                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2602                                                                 wal_segment_size);
2603                                 openLogFile = XLogFileOpen(openLogSegNo);
2604                                 openLogOff = 0;
2605                         }
2606
2607                         issue_xlog_fsync(openLogFile, openLogSegNo);
2608                 }
2609
2610                 /* signal that we need to wakeup walsenders later */
2611                 WalSndWakeupRequest();
2612
2613                 LogwrtResult.Flush = LogwrtResult.Write;
2614         }
2615
2616         /*
2617          * Update shared-memory status
2618          *
2619          * We make sure that the shared 'request' values do not fall behind the
2620          * 'result' values.  This is not absolutely essential, but it saves some
2621          * code in a couple of places.
2622          */
2623         {
2624                 SpinLockAcquire(&XLogCtl->info_lck);
2625                 XLogCtl->LogwrtResult = LogwrtResult;
2626                 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2627                         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2628                 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2629                         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2630                 SpinLockRelease(&XLogCtl->info_lck);
2631         }
2632 }
2633
2634 /*
2635  * Record the LSN for an asynchronous transaction commit/abort
2636  * and nudge the WALWriter if there is work for it to do.
2637  * (This should not be called for synchronous commits.)
2638  */
2639 void
2640 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2641 {
2642         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2643         bool            sleeping;
2644
2645         SpinLockAcquire(&XLogCtl->info_lck);
2646         LogwrtResult = XLogCtl->LogwrtResult;
2647         sleeping = XLogCtl->WalWriterSleeping;
2648         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2649                 XLogCtl->asyncXactLSN = asyncXactLSN;
2650         SpinLockRelease(&XLogCtl->info_lck);
2651
2652         /*
2653          * If the WALWriter is sleeping, we should kick it to make it come out of
2654          * low-power mode.  Otherwise, determine whether there's a full page of
2655          * WAL available to write.
2656          */
2657         if (!sleeping)
2658         {
2659                 /* back off to last completed page boundary */
2660                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2661
2662                 /* if we have already flushed that far, we're done */
2663                 if (WriteRqstPtr <= LogwrtResult.Flush)
2664                         return;
2665         }
2666
2667         /*
2668          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2669          * to come out of low-power mode so that this async commit will reach disk
2670          * within the expected amount of time.
2671          */
2672         if (ProcGlobal->walwriterLatch)
2673                 SetLatch(ProcGlobal->walwriterLatch);
2674 }
2675
2676 /*
2677  * Record the LSN up to which we can remove WAL because it's not required by
2678  * any replication slot.
2679  */
2680 void
2681 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2682 {
2683         SpinLockAcquire(&XLogCtl->info_lck);
2684         XLogCtl->replicationSlotMinLSN = lsn;
2685         SpinLockRelease(&XLogCtl->info_lck);
2686 }
2687
2688
2689 /*
2690  * Return the oldest LSN we must retain to satisfy the needs of some
2691  * replication slot.
2692  */
2693 static XLogRecPtr
2694 XLogGetReplicationSlotMinimumLSN(void)
2695 {
2696         XLogRecPtr      retval;
2697
2698         SpinLockAcquire(&XLogCtl->info_lck);
2699         retval = XLogCtl->replicationSlotMinLSN;
2700         SpinLockRelease(&XLogCtl->info_lck);
2701
2702         return retval;
2703 }
2704
2705 /*
2706  * Advance minRecoveryPoint in control file.
2707  *
2708  * If we crash during recovery, we must reach this point again before the
2709  * database is consistent.
2710  *
2711  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2712  * is only updated if it's not already greater than or equal to 'lsn'.
2713  */
2714 static void
2715 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2716 {
2717         /* Quick check using our local copy of the variable */
2718         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2719                 return;
2720
2721         /*
2722          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2723          * i.e., we're doing crash recovery.  We never modify the control file's
2724          * value in that case, so we can short-circuit future checks here too. The
2725          * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2726          * updated until crash recovery finishes.
2727          */
2728         if (XLogRecPtrIsInvalid(minRecoveryPoint))
2729         {
2730                 updateMinRecoveryPoint = false;
2731                 return;
2732         }
2733
2734         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2735
2736         /* update local copy */
2737         minRecoveryPoint = ControlFile->minRecoveryPoint;
2738         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2739
2740         if (force || minRecoveryPoint < lsn)
2741         {
2742                 XLogRecPtr      newMinRecoveryPoint;
2743                 TimeLineID      newMinRecoveryPointTLI;
2744
2745                 /*
2746                  * To avoid having to update the control file too often, we update it
2747                  * all the way to the last record being replayed, even though 'lsn'
2748                  * would suffice for correctness.  This also allows the 'force' case
2749                  * to not need a valid 'lsn' value.
2750                  *
2751                  * Another important reason for doing it this way is that the passed
2752                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2753                  * the caller got it from a corrupted heap page.  Accepting such a
2754                  * value as the min recovery point would prevent us from coming up at
2755                  * all.  Instead, we just log a warning and continue with recovery.
2756                  * (See also the comments about corrupt LSNs in XLogFlush.)
2757                  */
2758                 SpinLockAcquire(&XLogCtl->info_lck);
2759                 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2760                 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2761                 SpinLockRelease(&XLogCtl->info_lck);
2762
2763                 if (!force && newMinRecoveryPoint < lsn)
2764                         elog(WARNING,
2765                                  "xlog min recovery request %X/%X is past current point %X/%X",
2766                                  (uint32) (lsn >> 32), (uint32) lsn,
2767                                  (uint32) (newMinRecoveryPoint >> 32),
2768                                  (uint32) newMinRecoveryPoint);
2769
2770                 /* update control file */
2771                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2772                 {
2773                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2774                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2775                         UpdateControlFile();
2776                         minRecoveryPoint = newMinRecoveryPoint;
2777                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2778
2779                         ereport(DEBUG2,
2780                                         (errmsg("updated min recovery point to %X/%X on timeline %u",
2781                                                         (uint32) (minRecoveryPoint >> 32),
2782                                                         (uint32) minRecoveryPoint,
2783                                                         newMinRecoveryPointTLI)));
2784                 }
2785         }
2786         LWLockRelease(ControlFileLock);
2787 }
2788
2789 /*
2790  * Ensure that all XLOG data through the given position is flushed to disk.
2791  *
2792  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2793  * already held, and we try to avoid acquiring it if possible.
2794  */
2795 void
2796 XLogFlush(XLogRecPtr record)
2797 {
2798         XLogRecPtr      WriteRqstPtr;
2799         XLogwrtRqst WriteRqst;
2800
2801         /*
2802          * During REDO, we are reading not writing WAL.  Therefore, instead of
2803          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2804          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2805          * to act this way too, and because when it tries to write the
2806          * end-of-recovery checkpoint, it should indeed flush.
2807          */
2808         if (!XLogInsertAllowed())
2809         {
2810                 UpdateMinRecoveryPoint(record, false);
2811                 return;
2812         }
2813
2814         /* Quick exit if already known flushed */
2815         if (record <= LogwrtResult.Flush)
2816                 return;
2817
2818 #ifdef WAL_DEBUG
2819         if (XLOG_DEBUG)
2820                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2821                          (uint32) (record >> 32), (uint32) record,
2822                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2823                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2824 #endif
2825
2826         START_CRIT_SECTION();
2827
2828         /*
2829          * Since fsync is usually a horribly expensive operation, we try to
2830          * piggyback as much data as we can on each fsync: if we see any more data
2831          * entered into the xlog buffer, we'll write and fsync that too, so that
2832          * the final value of LogwrtResult.Flush is as large as possible. This
2833          * gives us some chance of avoiding another fsync immediately after.
2834          */
2835
2836         /* initialize to given target; may increase below */
2837         WriteRqstPtr = record;
2838
2839         /*
2840          * Now wait until we get the write lock, or someone else does the flush
2841          * for us.
2842          */
2843         for (;;)
2844         {
2845                 XLogRecPtr      insertpos;
2846
2847                 /* read LogwrtResult and update local state */
2848                 SpinLockAcquire(&XLogCtl->info_lck);
2849                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2850                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2851                 LogwrtResult = XLogCtl->LogwrtResult;
2852                 SpinLockRelease(&XLogCtl->info_lck);
2853
2854                 /* done already? */
2855                 if (record <= LogwrtResult.Flush)
2856                         break;
2857
2858                 /*
2859                  * Before actually performing the write, wait for all in-flight
2860                  * insertions to the pages we're about to write to finish.
2861                  */
2862                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2863
2864                 /*
2865                  * Try to get the write lock. If we can't get it immediately, wait
2866                  * until it's released, and recheck if we still need to do the flush
2867                  * or if the backend that held the lock did it for us already. This
2868                  * helps to maintain a good rate of group committing when the system
2869                  * is bottlenecked by the speed of fsyncing.
2870                  */
2871                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2872                 {
2873                         /*
2874                          * The lock is now free, but we didn't acquire it yet. Before we
2875                          * do, loop back to check if someone else flushed the record for
2876                          * us already.
2877                          */
2878                         continue;
2879                 }
2880
2881                 /* Got the lock; recheck whether request is satisfied */
2882                 LogwrtResult = XLogCtl->LogwrtResult;
2883                 if (record <= LogwrtResult.Flush)
2884                 {
2885                         LWLockRelease(WALWriteLock);
2886                         break;
2887                 }
2888
2889                 /*
2890                  * Sleep before flush! By adding a delay here, we may give further
2891                  * backends the opportunity to join the backlog of group commit
2892                  * followers; this can significantly improve transaction throughput,
2893                  * at the risk of increasing transaction latency.
2894                  *
2895                  * We do not sleep if enableFsync is not turned on, nor if there are
2896                  * fewer than CommitSiblings other backends with active transactions.
2897                  */
2898                 if (CommitDelay > 0 && enableFsync &&
2899                         MinimumActiveBackends(CommitSiblings))
2900                 {
2901                         pg_usleep(CommitDelay);
2902
2903                         /*
2904                          * Re-check how far we can now flush the WAL. It's generally not
2905                          * safe to call WaitXLogInsertionsToFinish while holding
2906                          * WALWriteLock, because an in-progress insertion might need to
2907                          * also grab WALWriteLock to make progress. But we know that all
2908                          * the insertions up to insertpos have already finished, because
2909                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2910                          * We're only calling it again to allow insertpos to be moved
2911                          * further forward, not to actually wait for anyone.
2912                          */
2913                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2914                 }
2915
2916                 /* try to write/flush later additions to XLOG as well */
2917                 WriteRqst.Write = insertpos;
2918                 WriteRqst.Flush = insertpos;
2919
2920                 XLogWrite(WriteRqst, false);
2921
2922                 LWLockRelease(WALWriteLock);
2923                 /* done */
2924                 break;
2925         }
2926
2927         END_CRIT_SECTION();
2928
2929         /* wake up walsenders now that we've released heavily contended locks */
2930         WalSndWakeupProcessRequests();
2931
2932         /*
2933          * If we still haven't flushed to the request point then we have a
2934          * problem; most likely, the requested flush point is past end of XLOG.
2935          * This has been seen to occur when a disk page has a corrupted LSN.
2936          *
2937          * Formerly we treated this as a PANIC condition, but that hurts the
2938          * system's robustness rather than helping it: we do not want to take down
2939          * the whole system due to corruption on one data page.  In particular, if
2940          * the bad page is encountered again during recovery then we would be
2941          * unable to restart the database at all!  (This scenario actually
2942          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2943          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2944          * the only time we can reach here during recovery is while flushing the
2945          * end-of-recovery checkpoint record, and we don't expect that to have a
2946          * bad LSN.
2947          *
2948          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2949          * since xact.c calls this routine inside a critical section.  However,
2950          * calls from bufmgr.c are not within critical sections and so we will not
2951          * force a restart for a bad LSN on a data page.
2952          */
2953         if (LogwrtResult.Flush < record)
2954                 elog(ERROR,
2955                          "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2956                          (uint32) (record >> 32), (uint32) record,
2957                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2958 }
2959
2960 /*
2961  * Write & flush xlog, but without specifying exactly where to.
2962  *
2963  * We normally write only completed blocks; but if there is nothing to do on
2964  * that basis, we check for unwritten async commits in the current incomplete
2965  * block, and write through the latest one of those.  Thus, if async commits
2966  * are not being used, we will write complete blocks only.
2967  *
2968  * If, based on the above, there's anything to write we do so immediately. But
2969  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2970  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2971  * more than wal_writer_flush_after unflushed blocks.
2972  *
2973  * We can guarantee that async commits reach disk after at most three
2974  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2975  * to write "flexibly", meaning it can stop at the end of the buffer ring;
2976  * this makes a difference only with very high load or long wal_writer_delay,
2977  * but imposes one extra cycle for the worst case for async commits.)
2978  *
2979  * This routine is invoked periodically by the background walwriter process.
2980  *
2981  * Returns true if there was any work to do, even if we skipped flushing due
2982  * to wal_writer_delay/wal_writer_flush_after.
2983  */
2984 bool
2985 XLogBackgroundFlush(void)
2986 {
2987         XLogwrtRqst WriteRqst;
2988         bool            flexible = true;
2989         static TimestampTz lastflush;
2990         TimestampTz now;
2991         int                     flushbytes;
2992
2993         /* XLOG doesn't need flushing during recovery */
2994         if (RecoveryInProgress())
2995                 return false;
2996
2997         /* read LogwrtResult and update local state */
2998         SpinLockAcquire(&XLogCtl->info_lck);
2999         LogwrtResult = XLogCtl->LogwrtResult;
3000         WriteRqst = XLogCtl->LogwrtRqst;
3001         SpinLockRelease(&XLogCtl->info_lck);
3002
3003         /* back off to last completed page boundary */
3004         WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3005
3006         /* if we have already flushed that far, consider async commit records */
3007         if (WriteRqst.Write <= LogwrtResult.Flush)
3008         {
3009                 SpinLockAcquire(&XLogCtl->info_lck);
3010                 WriteRqst.Write = XLogCtl->asyncXactLSN;
3011                 SpinLockRelease(&XLogCtl->info_lck);
3012                 flexible = false;               /* ensure it all gets written */
3013         }
3014
3015         /*
3016          * If already known flushed, we're done. Just need to check if we are
3017          * holding an open file handle to a logfile that's no longer in use,
3018          * preventing the file from being deleted.
3019          */
3020         if (WriteRqst.Write <= LogwrtResult.Flush)
3021         {
3022                 if (openLogFile >= 0)
3023                 {
3024                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3025                                                                  wal_segment_size))
3026                         {
3027                                 XLogFileClose();
3028                         }
3029                 }
3030                 return false;
3031         }
3032
3033         /*
3034          * Determine how far to flush WAL, based on the wal_writer_delay and
3035          * wal_writer_flush_after GUCs.
3036          */
3037         now = GetCurrentTimestamp();
3038         flushbytes =
3039                 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3040
3041         if (WalWriterFlushAfter == 0 || lastflush == 0)
3042         {
3043                 /* first call, or block based limits disabled */
3044                 WriteRqst.Flush = WriteRqst.Write;
3045                 lastflush = now;
3046         }
3047         else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3048         {
3049                 /*
3050                  * Flush the writes at least every WalWriteDelay ms. This is important
3051                  * to bound the amount of time it takes for an asynchronous commit to
3052                  * hit disk.
3053                  */
3054                 WriteRqst.Flush = WriteRqst.Write;
3055                 lastflush = now;
3056         }
3057         else if (flushbytes >= WalWriterFlushAfter)
3058         {
3059                 /* exceeded wal_writer_flush_after blocks, flush */
3060                 WriteRqst.Flush = WriteRqst.Write;
3061                 lastflush = now;
3062         }
3063         else
3064         {
3065                 /* no flushing, this time round */
3066                 WriteRqst.Flush = 0;
3067         }
3068
3069 #ifdef WAL_DEBUG
3070         if (XLOG_DEBUG)
3071                 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3072                          (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3073                          (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3074                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3075                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3076 #endif
3077
3078         START_CRIT_SECTION();
3079
3080         /* now wait for any in-progress insertions to finish and get write lock */
3081         WaitXLogInsertionsToFinish(WriteRqst.Write);
3082         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3083         LogwrtResult = XLogCtl->LogwrtResult;
3084         if (WriteRqst.Write > LogwrtResult.Write ||
3085                 WriteRqst.Flush > LogwrtResult.Flush)
3086         {
3087                 XLogWrite(WriteRqst, flexible);
3088         }
3089         LWLockRelease(WALWriteLock);
3090
3091         END_CRIT_SECTION();
3092
3093         /* wake up walsenders now that we've released heavily contended locks */
3094         WalSndWakeupProcessRequests();
3095
3096         /*
3097          * Great, done. To take some work off the critical path, try to initialize
3098          * as many of the no-longer-needed WAL buffers for future use as we can.
3099          */
3100         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3101
3102         /*
3103          * If we determined that we need to write data, but somebody else
3104          * wrote/flushed already, it should be considered as being active, to
3105          * avoid hibernating too early.
3106          */
3107         return true;
3108 }
3109
3110 /*
3111  * Test whether XLOG data has been flushed up to (at least) the given position.
3112  *
3113  * Returns true if a flush is still needed.  (It may be that someone else
3114  * is already in process of flushing that far, however.)
3115  */
3116 bool
3117 XLogNeedsFlush(XLogRecPtr record)
3118 {
3119         /*
3120          * During recovery, we don't flush WAL but update minRecoveryPoint
3121          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3122          * would need to be updated.
3123          */
3124         if (RecoveryInProgress())
3125         {
3126                 /*
3127                  * An invalid minRecoveryPoint means that we need to recover all the
3128                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3129                  * file's value in that case, so we can short-circuit future checks
3130                  * here too.
3131                  */
3132                 if (XLogRecPtrIsInvalid(minRecoveryPoint))
3133                         updateMinRecoveryPoint = false;
3134
3135                 /* Quick exit if already known to be updated or cannot be updated */
3136                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3137                         return false;
3138
3139                 /*
3140                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3141                  * just return a conservative guess.
3142                  */
3143                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3144                         return true;
3145                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3146                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3147                 LWLockRelease(ControlFileLock);
3148
3149                 /* check again */
3150                 return record > minRecoveryPoint;
3151         }
3152
3153         /* Quick exit if already known flushed */
3154         if (record <= LogwrtResult.Flush)
3155                 return false;
3156
3157         /* read LogwrtResult and update local state */
3158         SpinLockAcquire(&XLogCtl->info_lck);
3159         LogwrtResult = XLogCtl->LogwrtResult;
3160         SpinLockRelease(&XLogCtl->info_lck);
3161
3162         /* check again */
3163         if (record <= LogwrtResult.Flush)
3164                 return false;
3165
3166         return true;
3167 }
3168
3169 /*
3170  * Create a new XLOG file segment, or open a pre-existing one.
3171  *
3172  * log, seg: identify segment to be created/opened.
3173  *
3174  * *use_existent: if true, OK to use a pre-existing file (else, any
3175  * pre-existing file will be deleted).  On return, true if a pre-existing
3176  * file was used.
3177  *
3178  * use_lock: if true, acquire ControlFileLock while moving file into
3179  * place.  This should be true except during bootstrap log creation.  The
3180  * caller must *not* hold the lock at call.
3181  *
3182  * Returns FD of opened file.
3183  *
3184  * Note: errors here are ERROR not PANIC because we might or might not be
3185  * inside a critical section (eg, during checkpoint there is no reason to
3186  * take down the system on failure).  They will promote to PANIC if we are
3187  * in a critical section.
3188  */
3189 int
3190 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3191 {
3192         char            path[MAXPGPATH];
3193         char            tmppath[MAXPGPATH];
3194         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
3195         char       *zbuffer;
3196         XLogSegNo       installed_segno;
3197         XLogSegNo       max_segno;
3198         int                     fd;
3199         int                     nbytes;
3200
3201         XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
3202
3203         /*
3204          * Try to use existent file (checkpoint maker may have created it already)
3205          */
3206         if (*use_existent)
3207         {
3208                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3209                 if (fd < 0)
3210                 {
3211                         if (errno != ENOENT)
3212                                 ereport(ERROR,
3213                                                 (errcode_for_file_access(),
3214                                                  errmsg("could not open file \"%s\": %m", path)));
3215                 }
3216                 else
3217                         return fd;
3218         }
3219
3220         /*
3221          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3222          * another process is doing the same thing.  If so, we will end up
3223          * pre-creating an extra log segment.  That seems OK, and better than
3224          * holding the lock throughout this lengthy process.
3225          */
3226         elog(DEBUG2, "creating and filling new WAL file");
3227
3228         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3229
3230         unlink(tmppath);
3231
3232         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3233         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3234         if (fd < 0)
3235                 ereport(ERROR,
3236                                 (errcode_for_file_access(),
3237                                  errmsg("could not create file \"%s\": %m", tmppath)));
3238
3239         /*
3240          * Zero-fill the file.  We have to do this the hard way to ensure that all
3241          * the file space has really been allocated --- on platforms that allow
3242          * "holes" in files, just seeking to the end doesn't allocate intermediate
3243          * space.  This way, we know that we have all the space and (after the
3244          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3245          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3246          * log file.
3247          *
3248          * Note: ensure the buffer is reasonably well-aligned; this may save a few
3249          * cycles transferring data to the kernel.
3250          */
3251         zbuffer = (char *) MAXALIGN(zbuffer_raw);
3252         memset(zbuffer, 0, XLOG_BLCKSZ);
3253         for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
3254         {
3255                 errno = 0;
3256                 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3257                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3258                 {
3259                         int                     save_errno = errno;
3260
3261                         /*
3262                          * If we fail to make the file, delete it to release disk space
3263                          */
3264                         unlink(tmppath);
3265
3266                         close(fd);
3267
3268                         /* if write didn't set errno, assume problem is no disk space */
3269                         errno = save_errno ? save_errno : ENOSPC;
3270
3271                         ereport(ERROR,
3272                                         (errcode_for_file_access(),
3273                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3274                 }
3275                 pgstat_report_wait_end();
3276         }
3277
3278         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3279         if (pg_fsync(fd) != 0)
3280         {
3281                 int                     save_errno = errno;
3282
3283                 close(fd);
3284                 errno = save_errno;
3285                 ereport(ERROR,
3286                                 (errcode_for_file_access(),
3287                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3288         }
3289         pgstat_report_wait_end();
3290
3291         if (close(fd))
3292                 ereport(ERROR,
3293                                 (errcode_for_file_access(),
3294                                  errmsg("could not close file \"%s\": %m", tmppath)));
3295
3296         /*
3297          * Now move the segment into place with its final name.
3298          *
3299          * If caller didn't want to use a pre-existing file, get rid of any
3300          * pre-existing file.  Otherwise, cope with possibility that someone else
3301          * has created the file while we were filling ours: if so, use ours to
3302          * pre-create a future log segment.
3303          */
3304         installed_segno = logsegno;
3305
3306         /*
3307          * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3308          * that was a constant, but that was always a bit dubious: normally, at a
3309          * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3310          * here, it was the offset from the insert location. We can't do the
3311          * normal XLOGfileslop calculation here because we don't have access to
3312          * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3313          * CheckPointSegments.
3314          */
3315         max_segno = logsegno + CheckPointSegments;
3316         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3317                                                                 *use_existent, max_segno,
3318                                                                 use_lock))
3319         {
3320                 /*
3321                  * No need for any more future segments, or InstallXLogFileSegment()
3322                  * failed to rename the file into place. If the rename failed, opening
3323                  * the file below will fail.
3324                  */
3325                 unlink(tmppath);
3326         }
3327
3328         /* Set flag to tell caller there was no existent file */
3329         *use_existent = false;
3330
3331         /* Now open original target segment (might not be file I just made) */
3332         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3333         if (fd < 0)
3334                 ereport(ERROR,
3335                                 (errcode_for_file_access(),
3336                                  errmsg("could not open file \"%s\": %m", path)));
3337
3338         elog(DEBUG2, "done creating and filling new WAL file");
3339
3340         return fd;
3341 }
3342
3343 /*
3344  * Create a new XLOG file segment by copying a pre-existing one.
3345  *
3346  * destsegno: identify segment to be created.
3347  *
3348  * srcTLI, srcsegno: identify segment to be copied (could be from
3349  *              a different timeline)
3350  *
3351  * upto: how much of the source file to copy (the rest is filled with
3352  *              zeros)
3353  *
3354  * Currently this is only used during recovery, and so there are no locking
3355  * considerations.  But we should be just as tense as XLogFileInit to avoid
3356  * emplacing a bogus file.
3357  */
3358 static void
3359 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3360                          int upto)
3361 {
3362         char            path[MAXPGPATH];
3363         char            tmppath[MAXPGPATH];
3364         char            buffer[XLOG_BLCKSZ];
3365         int                     srcfd;
3366         int                     fd;
3367         int                     nbytes;
3368
3369         /*
3370          * Open the source file
3371          */
3372         XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3373         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3374         if (srcfd < 0)
3375                 ereport(ERROR,
3376                                 (errcode_for_file_access(),
3377                                  errmsg("could not open file \"%s\": %m", path)));
3378
3379         /*
3380          * Copy into a temp file name.
3381          */
3382         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3383
3384         unlink(tmppath);
3385
3386         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3387         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3388         if (fd < 0)
3389                 ereport(ERROR,
3390                                 (errcode_for_file_access(),
3391                                  errmsg("could not create file \"%s\": %m", tmppath)));
3392
3393         /*
3394          * Do the data copying.
3395          */
3396         for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3397         {
3398                 int                     nread;
3399
3400                 nread = upto - nbytes;
3401
3402                 /*
3403                  * The part that is not read from the source file is filled with
3404                  * zeros.
3405                  */
3406                 if (nread < sizeof(buffer))
3407                         memset(buffer, 0, sizeof(buffer));
3408
3409                 if (nread > 0)
3410                 {
3411                         int                     r;
3412
3413                         if (nread > sizeof(buffer))
3414                                 nread = sizeof(buffer);
3415                         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3416                         r = read(srcfd, buffer, nread);
3417                         if (r != nread)
3418                         {
3419                                 if (r < 0)
3420                                         ereport(ERROR,
3421                                                         (errcode_for_file_access(),
3422                                                          errmsg("could not read file \"%s\": %m",
3423                                                                         path)));
3424                                 else
3425                                         ereport(ERROR,
3426                                                         (errcode(ERRCODE_DATA_CORRUPTED),
3427                                                          errmsg("could not read file \"%s\": read %d of %zu",
3428                                                                         path, r, (Size) nread)));
3429                         }
3430                         pgstat_report_wait_end();
3431                 }
3432                 errno = 0;
3433                 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3434                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3435                 {
3436                         int                     save_errno = errno;
3437
3438                         /*
3439                          * If we fail to make the file, delete it to release disk space
3440                          */
3441                         unlink(tmppath);
3442                         /* if write didn't set errno, assume problem is no disk space */
3443                         errno = save_errno ? save_errno : ENOSPC;
3444
3445                         ereport(ERROR,
3446                                         (errcode_for_file_access(),
3447                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3448                 }
3449                 pgstat_report_wait_end();
3450         }
3451
3452         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3453         if (pg_fsync(fd) != 0)
3454                 ereport(ERROR,
3455                                 (errcode_for_file_access(),
3456                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3457         pgstat_report_wait_end();
3458
3459         if (CloseTransientFile(fd))
3460                 ereport(ERROR,
3461                                 (errcode_for_file_access(),
3462                                  errmsg("could not close file \"%s\": %m", tmppath)));
3463
3464         CloseTransientFile(srcfd);
3465
3466         /*
3467          * Now move the segment into place with its final name.
3468          */
3469         if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3470                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3471 }
3472
3473 /*
3474  * Install a new XLOG segment file as a current or future log segment.
3475  *
3476  * This is used both to install a newly-created segment (which has a temp
3477  * filename while it's being created) and to recycle an old segment.
3478  *
3479  * *segno: identify segment to install as (or first possible target).
3480  * When find_free is true, this is modified on return to indicate the
3481  * actual installation location or last segment searched.
3482  *
3483  * tmppath: initial name of file to install.  It will be renamed into place.
3484  *
3485  * find_free: if true, install the new segment at the first empty segno
3486  * number at or after the passed numbers.  If false, install the new segment
3487  * exactly where specified, deleting any existing segment file there.
3488  *
3489  * max_segno: maximum segment number to install the new file as.  Fail if no
3490  * free slot is found between *segno and max_segno. (Ignored when find_free
3491  * is false.)
3492  *
3493  * use_lock: if true, acquire ControlFileLock while moving file into
3494  * place.  This should be true except during bootstrap log creation.  The
3495  * caller must *not* hold the lock at call.
3496  *
3497  * Returns true if the file was installed successfully.  false indicates that
3498  * max_segno limit was exceeded, or an error occurred while renaming the
3499  * file into place.
3500  */
3501 static bool
3502 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3503                                            bool find_free, XLogSegNo max_segno,
3504                                            bool use_lock)
3505 {
3506         char            path[MAXPGPATH];
3507         struct stat stat_buf;
3508
3509         XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3510
3511         /*
3512          * We want to be sure that only one process does this at a time.
3513          */
3514         if (use_lock)
3515                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3516
3517         if (!find_free)
3518         {
3519                 /* Force installation: get rid of any pre-existing segment file */
3520                 durable_unlink(path, DEBUG1);
3521         }
3522         else
3523         {
3524                 /* Find a free slot to put it in */
3525                 while (stat(path, &stat_buf) == 0)
3526                 {
3527                         if ((*segno) >= max_segno)
3528                         {
3529                                 /* Failed to find a free slot within specified range */
3530                                 if (use_lock)
3531                                         LWLockRelease(ControlFileLock);
3532                                 return false;
3533                         }
3534                         (*segno)++;
3535                         XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3536                 }
3537         }
3538
3539         /*
3540          * Perform the rename using link if available, paranoidly trying to avoid
3541          * overwriting an existing file (there shouldn't be one).
3542          */
3543         if (durable_link_or_rename(tmppath, path, LOG) != 0)
3544         {
3545                 if (use_lock)
3546                         LWLockRelease(ControlFileLock);
3547                 /* durable_link_or_rename already emitted log message */
3548                 return false;
3549         }
3550
3551         if (use_lock)
3552                 LWLockRelease(ControlFileLock);
3553
3554         return true;
3555 }
3556
3557 /*
3558  * Open a pre-existing logfile segment for writing.
3559  */
3560 int
3561 XLogFileOpen(XLogSegNo segno)
3562 {
3563         char            path[MAXPGPATH];
3564         int                     fd;
3565
3566         XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3567
3568         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3569         if (fd < 0)
3570                 ereport(PANIC,
3571                                 (errcode_for_file_access(),
3572                                  errmsg("could not open file \"%s\": %m", path)));
3573
3574         return fd;
3575 }
3576
3577 /*
3578  * Open a logfile segment for reading (during recovery).
3579  *
3580  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3581  * Otherwise, it's assumed to be already available in pg_wal.
3582  */
3583 static int
3584 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3585                          int source, bool notfoundOk)
3586 {
3587         char            xlogfname[MAXFNAMELEN];
3588         char            activitymsg[MAXFNAMELEN + 16];
3589         char            path[MAXPGPATH];
3590         int                     fd;
3591
3592         XLogFileName(xlogfname, tli, segno, wal_segment_size);
3593
3594         switch (source)
3595         {
3596                 case XLOG_FROM_ARCHIVE:
3597                         /* Report recovery progress in PS display */
3598                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3599                                          xlogfname);
3600                         set_ps_display(activitymsg, false);
3601
3602                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3603                                                                                                           "RECOVERYXLOG",
3604                                                                                                           wal_segment_size,
3605                                                                                                           InRedo);
3606                         if (!restoredFromArchive)
3607                                 return -1;
3608                         break;
3609
3610                 case XLOG_FROM_PG_WAL:
3611                 case XLOG_FROM_STREAM:
3612                         XLogFilePath(path, tli, segno, wal_segment_size);
3613                         restoredFromArchive = false;
3614                         break;
3615
3616                 default:
3617                         elog(ERROR, "invalid XLogFileRead source %d", source);
3618         }
3619
3620         /*
3621          * If the segment was fetched from archival storage, replace the existing
3622          * xlog segment (if any) with the archival version.
3623          */
3624         if (source == XLOG_FROM_ARCHIVE)
3625         {
3626                 KeepFileRestoredFromArchive(path, xlogfname);
3627
3628                 /*
3629                  * Set path to point at the new file in pg_wal.
3630                  */
3631                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3632         }
3633
3634         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
3635         if (fd >= 0)
3636         {
3637                 /* Success! */
3638                 curFileTLI = tli;
3639
3640                 /* Report recovery progress in PS display */
3641                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3642                                  xlogfname);
3643                 set_ps_display(activitymsg, false);
3644
3645                 /* Track source of data in assorted state variables */
3646                 readSource = source;
3647                 XLogReceiptSource = source;
3648                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3649                 if (source != XLOG_FROM_STREAM)
3650                         XLogReceiptTime = GetCurrentTimestamp();
3651
3652                 return fd;
3653         }
3654         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3655                 ereport(PANIC,
3656                                 (errcode_for_file_access(),
3657                                  errmsg("could not open file \"%s\": %m", path)));
3658         return -1;
3659 }
3660
3661 /*
3662  * Open a logfile segment for reading (during recovery).
3663  *
3664  * This version searches for the segment with any TLI listed in expectedTLEs.
3665  */
3666 static int
3667 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3668 {
3669         char            path[MAXPGPATH];
3670         ListCell   *cell;
3671         int                     fd;
3672         List       *tles;
3673
3674         /*
3675          * Loop looking for a suitable timeline ID: we might need to read any of
3676          * the timelines listed in expectedTLEs.
3677          *
3678          * We expect curFileTLI on entry to be the TLI of the preceding file in
3679          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3680          * to go backwards; this prevents us from picking up the wrong file when a
3681          * parent timeline extends to higher segment numbers than the child we
3682          * want to read.
3683          *
3684          * If we haven't read the timeline history file yet, read it now, so that
3685          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3686          * however, unless we actually find a valid segment.  That way if there is
3687          * neither a timeline history file nor a WAL segment in the archive, and
3688          * streaming replication is set up, we'll read the timeline history file
3689          * streamed from the master when we start streaming, instead of recovering
3690          * with a dummy history generated here.
3691          */
3692         if (expectedTLEs)
3693                 tles = expectedTLEs;
3694         else
3695                 tles = readTimeLineHistory(recoveryTargetTLI);
3696
3697         foreach(cell, tles)
3698         {
3699                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3700
3701                 if (tli < curFileTLI)
3702                         break;                          /* don't bother looking at too-old TLIs */
3703
3704                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3705                 {
3706                         fd = XLogFileRead(segno, emode, tli,
3707                                                           XLOG_FROM_ARCHIVE, true);
3708                         if (fd != -1)
3709                         {
3710                                 elog(DEBUG1, "got WAL segment from archive");
3711                                 if (!expectedTLEs)
3712                                         expectedTLEs = tles;
3713                                 return fd;
3714                         }
3715                 }
3716
3717                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3718                 {
3719                         fd = XLogFileRead(segno, emode, tli,
3720                                                           XLOG_FROM_PG_WAL, true);
3721                         if (fd != -1)
3722                         {
3723                                 if (!expectedTLEs)
3724                                         expectedTLEs = tles;
3725                                 return fd;
3726                         }
3727                 }
3728         }
3729
3730         /* Couldn't find it.  For simplicity, complain about front timeline */
3731         XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
3732         errno = ENOENT;
3733         ereport(emode,
3734                         (errcode_for_file_access(),
3735                          errmsg("could not open file \"%s\": %m", path)));
3736         return -1;
3737 }
3738
3739 /*
3740  * Close the current logfile segment for writing.
3741  */
3742 static void
3743 XLogFileClose(void)
3744 {
3745         Assert(openLogFile >= 0);
3746
3747         /*
3748          * WAL segment files will not be re-read in normal operation, so we advise
3749          * the OS to release any cached pages.  But do not do so if WAL archiving
3750          * or streaming is active, because archiver and walsender process could
3751          * use the cache to read the WAL segment.
3752          */
3753 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3754         if (!XLogIsNeeded())
3755                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3756 #endif
3757
3758         if (close(openLogFile))
3759                 ereport(PANIC,
3760                                 (errcode_for_file_access(),
3761                                  errmsg("could not close file \"%s\": %m",
3762                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3763         openLogFile = -1;
3764 }
3765
3766 /*
3767  * Preallocate log files beyond the specified log endpoint.
3768  *
3769  * XXX this is currently extremely conservative, since it forces only one
3770  * future log segment to exist, and even that only if we are 75% done with
3771  * the current one.  This is only appropriate for very low-WAL-volume systems.
3772  * High-volume systems will be OK once they've built up a sufficient set of
3773  * recycled log segments, but the startup transient is likely to include
3774  * a lot of segment creations by foreground processes, which is not so good.
3775  */
3776 static void
3777 PreallocXlogFiles(XLogRecPtr endptr)
3778 {
3779         XLogSegNo       _logSegNo;
3780         int                     lf;
3781         bool            use_existent;
3782         uint64          offset;
3783
3784         XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3785         offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3786         if (offset >= (uint32) (0.75 * wal_segment_size))
3787         {
3788                 _logSegNo++;
3789                 use_existent = true;
3790                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3791                 close(lf);
3792                 if (!use_existent)
3793                         CheckpointStats.ckpt_segs_added++;
3794         }
3795 }
3796
3797 /*
3798  * Throws an error if the given log segment has already been removed or
3799  * recycled. The caller should only pass a segment that it knows to have
3800  * existed while the server has been running, as this function always
3801  * succeeds if no WAL segments have been removed since startup.
3802  * 'tli' is only used in the error message.
3803  *
3804  * Note: this function guarantees to keep errno unchanged on return.
3805  * This supports callers that use this to possibly deliver a better
3806  * error message about a missing file, while still being able to throw
3807  * a normal file-access error afterwards, if this does return.
3808  */
3809 void
3810 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3811 {
3812         int                     save_errno = errno;
3813         XLogSegNo       lastRemovedSegNo;
3814
3815         SpinLockAcquire(&XLogCtl->info_lck);
3816         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3817         SpinLockRelease(&XLogCtl->info_lck);
3818
3819         if (segno <= lastRemovedSegNo)
3820         {
3821                 char            filename[MAXFNAMELEN];
3822
3823                 XLogFileName(filename, tli, segno, wal_segment_size);
3824                 errno = save_errno;
3825                 ereport(ERROR,
3826                                 (errcode_for_file_access(),
3827                                  errmsg("requested WAL segment %s has already been removed",
3828                                                 filename)));
3829         }
3830         errno = save_errno;
3831 }
3832
3833 /*
3834  * Return the last WAL segment removed, or 0 if no segment has been removed
3835  * since startup.
3836  *
3837  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3838  * with that.
3839  */
3840 XLogSegNo
3841 XLogGetLastRemovedSegno(void)
3842 {
3843         XLogSegNo       lastRemovedSegNo;
3844
3845         SpinLockAcquire(&XLogCtl->info_lck);
3846         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3847         SpinLockRelease(&XLogCtl->info_lck);
3848
3849         return lastRemovedSegNo;
3850 }
3851
3852 /*
3853  * Update the last removed segno pointer in shared memory, to reflect
3854  * that the given XLOG file has been removed.
3855  */
3856 static void
3857 UpdateLastRemovedPtr(char *filename)
3858 {
3859         uint32          tli;
3860         XLogSegNo       segno;
3861
3862         XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3863
3864         SpinLockAcquire(&XLogCtl->info_lck);
3865         if (segno > XLogCtl->lastRemovedSegNo)
3866                 XLogCtl->lastRemovedSegNo = segno;
3867         SpinLockRelease(&XLogCtl->info_lck);
3868 }
3869
3870 /*
3871  * Remove all temporary log files in pg_wal
3872  *
3873  * This is called at the beginning of recovery after a previous crash,
3874  * at a point where no other processes write fresh WAL data.
3875  */
3876 static void
3877 RemoveTempXlogFiles(void)
3878 {
3879         DIR                *xldir;
3880         struct dirent *xlde;
3881
3882         elog(DEBUG2, "removing all temporary WAL segments");
3883
3884         xldir = AllocateDir(XLOGDIR);
3885         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3886         {
3887                 char            path[MAXPGPATH];
3888
3889                 if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3890                         continue;
3891
3892                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3893                 unlink(path);
3894                 elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3895         }
3896         FreeDir(xldir);
3897 }
3898
3899 /*
3900  * Recycle or remove all log files older or equal to passed segno.
3901  *
3902  * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3903  * redo pointer of the previous checkpoint. These are used to determine
3904  * whether we want to recycle rather than delete no-longer-wanted log files.
3905  */
3906 static void
3907 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3908 {
3909         DIR                *xldir;
3910         struct dirent *xlde;
3911         char            lastoff[MAXFNAMELEN];
3912
3913         /*
3914          * Construct a filename of the last segment to be kept. The timeline ID
3915          * doesn't matter, we ignore that in the comparison. (During recovery,
3916          * ThisTimeLineID isn't set, so we can't use that.)
3917          */
3918         XLogFileName(lastoff, 0, segno, wal_segment_size);
3919
3920         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3921                  lastoff);
3922
3923         xldir = AllocateDir(XLOGDIR);
3924
3925         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3926         {
3927                 /* Ignore files that are not XLOG segments */
3928                 if (!IsXLogFileName(xlde->d_name) &&
3929                         !IsPartialXLogFileName(xlde->d_name))
3930                         continue;
3931
3932                 /*
3933                  * We ignore the timeline part of the XLOG segment identifiers in
3934                  * deciding whether a segment is still needed.  This ensures that we
3935                  * won't prematurely remove a segment from a parent timeline. We could
3936                  * probably be a little more proactive about removing segments of
3937                  * non-parent timelines, but that would be a whole lot more
3938                  * complicated.
3939                  *
3940                  * We use the alphanumeric sorting property of the filenames to decide
3941                  * which ones are earlier than the lastoff segment.
3942                  */
3943                 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3944                 {
3945                         if (XLogArchiveCheckDone(xlde->d_name))
3946                         {
3947                                 /* Update the last removed location in shared memory first */
3948                                 UpdateLastRemovedPtr(xlde->d_name);
3949
3950                                 RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
3951                         }
3952                 }
3953         }
3954
3955         FreeDir(xldir);
3956 }
3957
3958 /*
3959  * Remove WAL files that are not part of the given timeline's history.
3960  *
3961  * This is called during recovery, whenever we switch to follow a new
3962  * timeline, and at the end of recovery when we create a new timeline. We
3963  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3964  * might be leftover pre-allocated or recycled WAL segments on the old timeline
3965  * that we haven't used yet, and contain garbage. If we just leave them in
3966  * pg_wal, they will eventually be archived, and we can't let that happen.
3967  * Files that belong to our timeline history are valid, because we have
3968  * successfully replayed them, but from others we can't be sure.
3969  *
3970  * 'switchpoint' is the current point in WAL where we switch to new timeline,
3971  * and 'newTLI' is the new timeline we switch to.
3972  */
3973 static void
3974 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3975 {
3976         DIR                *xldir;
3977         struct dirent *xlde;
3978         char            switchseg[MAXFNAMELEN];
3979         XLogSegNo       endLogSegNo;
3980
3981         XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
3982
3983         /*
3984          * Construct a filename of the last segment to be kept.
3985          */
3986         XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
3987
3988         elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
3989                  switchseg);
3990
3991         xldir = AllocateDir(XLOGDIR);
3992
3993         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3994         {
3995                 /* Ignore files that are not XLOG segments */
3996                 if (!IsXLogFileName(xlde->d_name))
3997                         continue;
3998
3999                 /*
4000                  * Remove files that are on a timeline older than the new one we're
4001                  * switching to, but with a segment number >= the first segment on the
4002                  * new timeline.
4003                  */
4004                 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4005                         strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4006                 {
4007                         /*
4008                          * If the file has already been marked as .ready, however, don't
4009                          * remove it yet. It should be OK to remove it - files that are
4010                          * not part of our timeline history are not required for recovery
4011                          * - but seems safer to let them be archived and removed later.
4012                          */
4013                         if (!XLogArchiveIsReady(xlde->d_name))
4014                                 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
4015                 }
4016         }
4017
4018         FreeDir(xldir);
4019 }
4020
4021 /*
4022  * Recycle or remove a log file that's no longer needed.
4023  *
4024  * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
4025  * redo pointer of the previous checkpoint. These are used to determine
4026  * whether we want to recycle rather than delete no-longer-wanted log files.
4027  * If PriorRedoRecPtr is not known, pass invalid, and the function will
4028  * recycle, somewhat arbitrarily, 10 future segments.
4029  */
4030 static void
4031 RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
4032 {
4033         char            path[MAXPGPATH];
4034 #ifdef WIN32
4035         char            newpath[MAXPGPATH];
4036 #endif
4037         struct stat statbuf;
4038         XLogSegNo       endlogSegNo;
4039         XLogSegNo       recycleSegNo;
4040
4041         /*
4042          * Initialize info about where to try to recycle to.
4043          */
4044         XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4045         if (PriorRedoPtr == InvalidXLogRecPtr)
4046                 recycleSegNo = endlogSegNo + 10;
4047         else
4048                 recycleSegNo = XLOGfileslop(PriorRedoPtr);
4049
4050         snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4051
4052         /*
4053          * Before deleting the file, see if it can be recycled as a future log
4054          * segment. Only recycle normal files, pg_standby for example can create
4055          * symbolic links pointing to a separate archive directory.
4056          */
4057         if (endlogSegNo <= recycleSegNo &&
4058                 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4059                 InstallXLogFileSegment(&endlogSegNo, path,
4060                                                            true, recycleSegNo, true))
4061         {
4062                 ereport(DEBUG2,
4063                                 (errmsg("recycled write-ahead log file \"%s\"",
4064                                                 segname)));
4065                 CheckpointStats.ckpt_segs_recycled++;
4066                 /* Needn't recheck that slot on future iterations */
4067                 endlogSegNo++;
4068         }
4069         else
4070         {
4071                 /* No need for any more future segments... */
4072                 int                     rc;
4073
4074                 ereport(DEBUG2,
4075                                 (errmsg("removing write-ahead log file \"%s\"",
4076                                                 segname)));
4077
4078 #ifdef WIN32
4079
4080                 /*
4081                  * On Windows, if another process (e.g another backend) holds the file
4082                  * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4083                  * will still show up in directory listing until the last handle is
4084                  * closed. To avoid confusing the lingering deleted file for a live
4085                  * WAL file that needs to be archived, rename it before deleting it.
4086                  *
4087                  * If another process holds the file open without FILE_SHARE_DELETE
4088                  * flag, rename will fail. We'll try again at the next checkpoint.
4089                  */
4090                 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4091                 if (rename(path, newpath) != 0)
4092                 {
4093                         ereport(LOG,
4094                                         (errcode_for_file_access(),
4095                                          errmsg("could not rename file \"%s\": %m",
4096                                                         path)));
4097                         return;
4098                 }
4099                 rc = durable_unlink(newpath, LOG);
4100 #else
4101                 rc = durable_unlink(path, LOG);
4102 #endif
4103                 if (rc != 0)
4104                 {
4105                         /* Message already logged by durable_unlink() */
4106                         return;
4107                 }
4108                 CheckpointStats.ckpt_segs_removed++;
4109         }
4110
4111         XLogArchiveCleanup(segname);
4112 }
4113
4114 /*
4115  * Verify whether pg_wal and pg_wal/archive_status exist.
4116  * If the latter does not exist, recreate it.
4117  *
4118  * It is not the goal of this function to verify the contents of these
4119  * directories, but to help in cases where someone has performed a cluster
4120  * copy for PITR purposes but omitted pg_wal from the copy.
4121  *
4122  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4123  * policy decision was made not to.  It is fairly common for pg_wal to be
4124  * a symlink, and if that was the DBA's intent then automatically making a
4125  * plain directory would result in degraded performance with no notice.
4126  */
4127 static void
4128 ValidateXLOGDirectoryStructure(void)
4129 {
4130         char            path[MAXPGPATH];
4131         struct stat stat_buf;
4132
4133         /* Check for pg_wal; if it doesn't exist, error out */
4134         if (stat(XLOGDIR, &stat_buf) != 0 ||
4135                 !S_ISDIR(stat_buf.st_mode))
4136                 ereport(FATAL,
4137                                 (errmsg("required WAL directory \"%s\" does not exist",
4138                                                 XLOGDIR)));
4139
4140         /* Check for archive_status */
4141         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4142         if (stat(path, &stat_buf) == 0)
4143         {
4144                 /* Check for weird cases where it exists but isn't a directory */
4145                 if (!S_ISDIR(stat_buf.st_mode))
4146                         ereport(FATAL,
4147                                         (errmsg("required WAL directory \"%s\" does not exist",
4148                                                         path)));
4149         }
4150         else
4151         {
4152                 ereport(LOG,
4153                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4154                 if (MakePGDirectory(path) < 0)
4155                         ereport(FATAL,
4156                                         (errmsg("could not create missing directory \"%s\": %m",
4157                                                         path)));
4158         }
4159 }
4160
4161 /*
4162  * Remove previous backup history files.  This also retries creation of
4163  * .ready files for any backup history files for which XLogArchiveNotify
4164  * failed earlier.
4165  */
4166 static void
4167 CleanupBackupHistory(void)
4168 {
4169         DIR                *xldir;
4170         struct dirent *xlde;
4171         char            path[MAXPGPATH + sizeof(XLOGDIR)];
4172
4173         xldir = AllocateDir(XLOGDIR);
4174
4175         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4176         {
4177                 if (IsBackupHistoryFileName(xlde->d_name))
4178                 {
4179                         if (XLogArchiveCheckDone(xlde->d_name))
4180                         {
4181                                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4182                                          xlde->d_name);
4183                                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4184                                 unlink(path);
4185                                 XLogArchiveCleanup(xlde->d_name);
4186                         }
4187                 }
4188         }
4189
4190         FreeDir(xldir);
4191 }
4192
4193 /*
4194  * Attempt to read an XLOG record.
4195  *
4196  * If RecPtr is valid, try to read a record at that position.  Otherwise
4197  * try to read a record just after the last one previously read.
4198  *
4199  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4200  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4201  * record is available.
4202  *
4203  * The record is copied into readRecordBuf, so that on successful return,
4204  * the returned record pointer always points there.
4205  */
4206 static XLogRecord *
4207 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4208                    bool fetching_ckpt)
4209 {
4210         XLogRecord *record;
4211         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4212
4213         /* Pass through parameters to XLogPageRead */
4214         private->fetching_ckpt = fetching_ckpt;
4215         private->emode = emode;
4216         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4217
4218         /* This is the first attempt to read this page. */
4219         lastSourceFailed = false;
4220
4221         for (;;)
4222         {
4223                 char       *errormsg;
4224
4225                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4226                 ReadRecPtr = xlogreader->ReadRecPtr;
4227                 EndRecPtr = xlogreader->EndRecPtr;
4228                 if (record == NULL)
4229                 {
4230                         if (readFile >= 0)
4231                         {
4232                                 close(readFile);
4233                                 readFile = -1;
4234                         }
4235
4236                         /*
4237                          * We only end up here without a message when XLogPageRead()
4238                          * failed - in that case we already logged something. In
4239                          * StandbyMode that only happens if we have been triggered, so we
4240                          * shouldn't loop anymore in that case.
4241                          */
4242                         if (errormsg)
4243                                 ereport(emode_for_corrupt_record(emode,
4244                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4245                                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4246                 }
4247
4248                 /*
4249                  * Check page TLI is one of the expected values.
4250                  */
4251                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4252                 {
4253                         char            fname[MAXFNAMELEN];
4254                         XLogSegNo       segno;
4255                         int32           offset;
4256
4257                         XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
4258                         offset = XLogSegmentOffset(xlogreader->latestPagePtr,
4259                                                                            wal_segment_size);
4260                         XLogFileName(fname, xlogreader->readPageTLI, segno,
4261                                                  wal_segment_size);
4262                         ereport(emode_for_corrupt_record(emode,
4263                                                                                          RecPtr ? RecPtr : EndRecPtr),
4264                                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4265                                                         xlogreader->latestPageTLI,
4266                                                         fname,
4267                                                         offset)));
4268                         record = NULL;
4269                 }
4270
4271                 if (record)
4272                 {
4273                         /* Great, got a record */
4274                         return record;
4275                 }
4276                 else
4277                 {
4278                         /* No valid record available from this source */
4279                         lastSourceFailed = true;
4280
4281                         /*
4282                          * If archive recovery was requested, but we were still doing
4283                          * crash recovery, switch to archive recovery and retry using the
4284                          * offline archive. We have now replayed all the valid WAL in
4285                          * pg_wal, so we are presumably now consistent.
4286                          *
4287                          * We require that there's at least some valid WAL present in
4288                          * pg_wal, however (!fetching_ckpt).  We could recover using the
4289                          * WAL from the archive, even if pg_wal is completely empty, but
4290                          * we'd have no idea how far we'd have to replay to reach
4291                          * consistency.  So err on the safe side and give up.
4292                          */
4293                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4294                                 !fetching_ckpt)
4295                         {
4296                                 ereport(DEBUG1,
4297                                                 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4298                                 InArchiveRecovery = true;
4299                                 if (StandbyModeRequested)
4300                                         StandbyMode = true;
4301
4302                                 /* initialize minRecoveryPoint to this record */
4303                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4304                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4305                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4306                                 {
4307                                         ControlFile->minRecoveryPoint = EndRecPtr;
4308                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4309                                 }
4310                                 /* update local copy */
4311                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4312                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4313
4314                                 /*
4315                                  * The startup process can update its local copy of
4316                                  * minRecoveryPoint from this point.
4317                                  */
4318                                 updateMinRecoveryPoint = true;
4319
4320                                 UpdateControlFile();
4321                                 LWLockRelease(ControlFileLock);
4322
4323                                 CheckRecoveryConsistency();
4324
4325                                 /*
4326                                  * Before we retry, reset lastSourceFailed and currentSource
4327                                  * so that we will check the archive next.
4328                                  */
4329                                 lastSourceFailed = false;
4330                                 currentSource = 0;
4331
4332                                 continue;
4333                         }
4334
4335                         /* In standby mode, loop back to retry. Otherwise, give up. */
4336                         if (StandbyMode && !CheckForStandbyTrigger())
4337                                 continue;
4338                         else
4339                                 return NULL;
4340                 }
4341         }
4342 }
4343
4344 /*
4345  * Scan for new timelines that might have appeared in the archive since we
4346  * started recovery.
4347  *
4348  * If there are any, the function changes recovery target TLI to the latest
4349  * one and returns 'true'.
4350  */
4351 static bool
4352 rescanLatestTimeLine(void)
4353 {
4354         List       *newExpectedTLEs;
4355         bool            found;
4356         ListCell   *cell;
4357         TimeLineID      newtarget;
4358         TimeLineID      oldtarget = recoveryTargetTLI;
4359         TimeLineHistoryEntry *currentTle = NULL;
4360
4361         newtarget = findNewestTimeLine(recoveryTargetTLI);
4362         if (newtarget == recoveryTargetTLI)
4363         {
4364                 /* No new timelines found */
4365                 return false;
4366         }
4367
4368         /*
4369          * Determine the list of expected TLIs for the new TLI
4370          */
4371
4372         newExpectedTLEs = readTimeLineHistory(newtarget);
4373
4374         /*
4375          * If the current timeline is not part of the history of the new timeline,
4376          * we cannot proceed to it.
4377          */
4378         found = false;
4379         foreach(cell, newExpectedTLEs)
4380         {
4381                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4382
4383                 if (currentTle->tli == recoveryTargetTLI)
4384                 {
4385                         found = true;
4386                         break;
4387                 }
4388         }
4389         if (!found)
4390         {
4391                 ereport(LOG,
4392                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4393                                                 newtarget,
4394                                                 ThisTimeLineID)));
4395                 return false;
4396         }
4397
4398         /*
4399          * The current timeline was found in the history file, but check that the
4400          * next timeline was forked off from it *after* the current recovery
4401          * location.
4402          */
4403         if (currentTle->end < EndRecPtr)
4404         {
4405                 ereport(LOG,
4406                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4407                                                 newtarget,
4408                                                 ThisTimeLineID,
4409                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4410                 return false;
4411         }
4412
4413         /* The new timeline history seems valid. Switch target */
4414         recoveryTargetTLI = newtarget;
4415         list_free_deep(expectedTLEs);
4416         expectedTLEs = newExpectedTLEs;
4417
4418         /*
4419          * As in StartupXLOG(), try to ensure we have all the history files
4420          * between the old target and new target in pg_wal.
4421          */
4422         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4423
4424         ereport(LOG,
4425                         (errmsg("new target timeline is %u",
4426                                         recoveryTargetTLI)));
4427
4428         return true;
4429 }
4430
4431 /*
4432  * I/O routines for pg_control
4433  *
4434  * *ControlFile is a buffer in shared memory that holds an image of the
4435  * contents of pg_control.  WriteControlFile() initializes pg_control
4436  * given a preloaded buffer, ReadControlFile() loads the buffer from
4437  * the pg_control file (during postmaster or standalone-backend startup),
4438  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4439  *
4440  * For simplicity, WriteControlFile() initializes the fields of pg_control
4441  * that are related to checking backend/database compatibility, and
4442  * ReadControlFile() verifies they are correct.  We could split out the
4443  * I/O and compatibility-check functions, but there seems no need currently.
4444  */
4445 static void
4446 WriteControlFile(void)
4447 {
4448         int                     fd;
4449         char            buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
4450
4451         /*
4452          * Ensure that the size of the pg_control data structure is sane.  See the
4453          * comments for these symbols in pg_control.h.
4454          */
4455         StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
4456                                          "pg_control is too large for atomic disk writes");
4457         StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
4458                                          "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
4459
4460         /*
4461          * Initialize version and compatibility-check fields
4462          */
4463         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4464         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4465
4466         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4467         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4468
4469         ControlFile->blcksz = BLCKSZ;
4470         ControlFile->relseg_size = RELSEG_SIZE;
4471         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4472         ControlFile->xlog_seg_size = wal_segment_size;
4473
4474         ControlFile->nameDataLen = NAMEDATALEN;
4475         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4476
4477         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4478         ControlFile->loblksize = LOBLKSIZE;
4479
4480         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4481         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4482
4483         /* Contents are protected with a CRC */
4484         INIT_CRC32C(ControlFile->crc);
4485         COMP_CRC32C(ControlFile->crc,
4486                                 (char *) ControlFile,
4487                                 offsetof(ControlFileData, crc));
4488         FIN_CRC32C(ControlFile->crc);
4489
4490         /*
4491          * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4492          * the excess over sizeof(ControlFileData).  This reduces the odds of
4493          * premature-EOF errors when reading pg_control.  We'll still fail when we
4494          * check the contents of the file, but hopefully with a more specific
4495          * error than "couldn't read pg_control".
4496          */
4497         memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4498         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4499
4500         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4501                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4502         if (fd < 0)
4503                 ereport(PANIC,
4504                                 (errcode_for_file_access(),
4505                                  errmsg("could not create file \"%s\": %m",
4506                                                 XLOG_CONTROL_FILE)));
4507
4508         errno = 0;
4509         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4510         if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4511         {
4512                 /* if write didn't set errno, assume problem is no disk space */
4513                 if (errno == 0)
4514                         errno = ENOSPC;
4515                 ereport(PANIC,
4516                                 (errcode_for_file_access(),
4517                                  errmsg("could not write to file \"%s\": %m",
4518                                                 XLOG_CONTROL_FILE)));
4519         }
4520         pgstat_report_wait_end();
4521
4522         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4523         if (pg_fsync(fd) != 0)
4524                 ereport(PANIC,
4525                                 (errcode_for_file_access(),
4526                                  errmsg("could not fsync file \"%s\": %m",
4527                                                 XLOG_CONTROL_FILE)));
4528         pgstat_report_wait_end();
4529
4530         if (close(fd))
4531                 ereport(PANIC,
4532                                 (errcode_for_file_access(),
4533                                  errmsg("could not close file \"%s\": %m",
4534                                                 XLOG_CONTROL_FILE)));
4535 }
4536
4537 static void
4538 ReadControlFile(void)
4539 {
4540         pg_crc32c       crc;
4541         int                     fd;
4542         static char wal_segsz_str[20];
4543         int                     r;
4544
4545         /*
4546          * Read data...
4547          */
4548         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4549                                            O_RDWR | PG_BINARY);
4550         if (fd < 0)
4551                 ereport(PANIC,
4552                                 (errcode_for_file_access(),
4553                                  errmsg("could not open file \"%s\": %m",
4554                                                 XLOG_CONTROL_FILE)));
4555
4556         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4557         r = read(fd, ControlFile, sizeof(ControlFileData));
4558         if (r != sizeof(ControlFileData))
4559         {
4560                 if (r < 0)
4561                         ereport(PANIC,
4562                                         (errcode_for_file_access(),
4563                                          errmsg("could not read file \"%s\": %m",
4564                                                         XLOG_CONTROL_FILE)));
4565                 else
4566                         ereport(PANIC,
4567                                         (errcode(ERRCODE_DATA_CORRUPTED),
4568                                          errmsg("could not read file \"%s\": read %d of %zu",
4569                                                         XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4570         }
4571         pgstat_report_wait_end();
4572
4573         close(fd);
4574
4575         /*
4576          * Check for expected pg_control format version.  If this is wrong, the
4577          * CRC check will likely fail because we'll be checking the wrong number
4578          * of bytes.  Complaining about wrong version will probably be more
4579          * enlightening than complaining about wrong CRC.
4580          */
4581
4582         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4583                 ereport(FATAL,
4584                                 (errmsg("database files are incompatible with server"),
4585                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4586                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4587                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
4588                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4589                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4590
4591         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4592                 ereport(FATAL,
4593                                 (errmsg("database files are incompatible with server"),
4594                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4595                                                    " but the server was compiled with PG_CONTROL_VERSION %d.",
4596                                                    ControlFile->pg_control_version, PG_CONTROL_VERSION),
4597                                  errhint("It looks like you need to initdb.")));
4598
4599         /* Now check the CRC. */
4600         INIT_CRC32C(crc);
4601         COMP_CRC32C(crc,
4602                                 (char *) ControlFile,
4603                                 offsetof(ControlFileData, crc));
4604         FIN_CRC32C(crc);
4605
4606         if (!EQ_CRC32C(crc, ControlFile->crc))
4607                 ereport(FATAL,
4608                                 (errmsg("incorrect checksum in control file")));
4609
4610         /*
4611          * Do compatibility checking immediately.  If the database isn't
4612          * compatible with the backend executable, we want to abort before we can
4613          * possibly do any damage.
4614          */
4615         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4616                 ereport(FATAL,
4617                                 (errmsg("database files are incompatible with server"),
4618                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4619                                                    " but the server was compiled with CATALOG_VERSION_NO %d.",
4620                                                    ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4621                                  errhint("It looks like you need to initdb.")));
4622         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4623                 ereport(FATAL,
4624                                 (errmsg("database files are incompatible with server"),
4625                                  errdetail("The database cluster was initialized with MAXALIGN %d,"
4626                                                    " but the server was compiled with MAXALIGN %d.",
4627                                                    ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4628                                  errhint("It looks like you need to initdb.")));
4629         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4630                 ereport(FATAL,
4631                                 (errmsg("database files are incompatible with server"),
4632                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4633                                  errhint("It looks like you need to initdb.")));
4634         if (ControlFile->blcksz != BLCKSZ)
4635                 ereport(FATAL,
4636                                 (errmsg("database files are incompatible with server"),
4637                                  errdetail("The database cluster was initialized with BLCKSZ %d,"
4638                                                    " but the server was compiled with BLCKSZ %d.",
4639                                                    ControlFile->blcksz, BLCKSZ),
4640                                  errhint("It looks like you need to recompile or initdb.")));
4641         if (ControlFile->relseg_size != RELSEG_SIZE)
4642                 ereport(FATAL,
4643                                 (errmsg("database files are incompatible with server"),
4644                                  errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4645                                                    " but the server was compiled with RELSEG_SIZE %d.",
4646                                                    ControlFile->relseg_size, RELSEG_SIZE),
4647                                  errhint("It looks like you need to recompile or initdb.")));
4648         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4649                 ereport(FATAL,
4650                                 (errmsg("database files are incompatible with server"),
4651                                  errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4652                                                    " but the server was compiled with XLOG_BLCKSZ %d.",
4653                                                    ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4654                                  errhint("It looks like you need to recompile or initdb.")));
4655         if (ControlFile->nameDataLen != NAMEDATALEN)
4656                 ereport(FATAL,
4657                                 (errmsg("database files are incompatible with server"),
4658                                  errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4659                                                    " but the server was compiled with NAMEDATALEN %d.",
4660                                                    ControlFile->nameDataLen, NAMEDATALEN),
4661                                  errhint("It looks like you need to recompile or initdb.")));
4662         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4663                 ereport(FATAL,
4664                                 (errmsg("database files are incompatible with server"),
4665                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4666                                                    " but the server was compiled with INDEX_MAX_KEYS %d.",
4667                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4668                                  errhint("It looks like you need to recompile or initdb.")));
4669         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4670                 ereport(FATAL,
4671                                 (errmsg("database files are incompatible with server"),
4672                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4673                                                    " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4674                                                    ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4675                                  errhint("It looks like you need to recompile or initdb.")));
4676         if (ControlFile->loblksize != LOBLKSIZE)
4677                 ereport(FATAL,
4678                                 (errmsg("database files are incompatible with server"),
4679                                  errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4680                                                    " but the server was compiled with LOBLKSIZE %d.",
4681                                                    ControlFile->loblksize, (int) LOBLKSIZE),
4682                                  errhint("It looks like you need to recompile or initdb.")));
4683
4684 #ifdef USE_FLOAT4_BYVAL
4685         if (ControlFile->float4ByVal != true)
4686                 ereport(FATAL,
4687                                 (errmsg("database files are incompatible with server"),
4688                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4689                                                    " but the server was compiled with USE_FLOAT4_BYVAL."),
4690                                  errhint("It looks like you need to recompile or initdb.")));
4691 #else
4692         if (ControlFile->float4ByVal != false)
4693                 ereport(FATAL,
4694                                 (errmsg("database files are incompatible with server"),
4695                                  errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4696                                                    " but the server was compiled without USE_FLOAT4_BYVAL."),
4697                                  errhint("It looks like you need to recompile or initdb.")));
4698 #endif
4699
4700 #ifdef USE_FLOAT8_BYVAL
4701         if (ControlFile->float8ByVal != true)
4702                 ereport(FATAL,
4703                                 (errmsg("database files are incompatible with server"),
4704                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4705                                                    " but the server was compiled with USE_FLOAT8_BYVAL."),
4706                                  errhint("It looks like you need to recompile or initdb.")));
4707 #else
4708         if (ControlFile->float8ByVal != false)
4709                 ereport(FATAL,
4710                                 (errmsg("database files are incompatible with server"),
4711                                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4712                                                    " but the server was compiled without USE_FLOAT8_BYVAL."),
4713                                  errhint("It looks like you need to recompile or initdb.")));
4714 #endif
4715
4716         wal_segment_size = ControlFile->xlog_seg_size;
4717
4718         if (!IsValidWalSegSize(wal_segment_size))
4719                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4720                                                 errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
4721                                                                           "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
4722                                                                           wal_segment_size,
4723                                                                           wal_segment_size)));
4724
4725         snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4726         SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4727                                         PGC_S_OVERRIDE);
4728
4729         /* check and update variables dependent on wal_segment_size */
4730         if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
4731                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4732                                                 errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\".")));
4733
4734         if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
4735                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4736                                                 errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\".")));
4737
4738         UsableBytesInSegment =
4739                 (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4740                 (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4741
4742         CalculateCheckpointSegments();
4743
4744         /* Make the initdb settings visible as GUC variables, too */
4745         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4746                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4747 }
4748
4749 void
4750 UpdateControlFile(void)
4751 {
4752         int                     fd;
4753
4754         INIT_CRC32C(ControlFile->crc);
4755         COMP_CRC32C(ControlFile->crc,
4756                                 (char *) ControlFile,
4757                                 offsetof(ControlFileData, crc));
4758         FIN_CRC32C(ControlFile->crc);
4759
4760         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4761                                            O_RDWR | PG_BINARY);
4762         if (fd < 0)
4763                 ereport(PANIC,
4764                                 (errcode_for_file_access(),
4765                                  errmsg("could not open file \"%s\": %m", XLOG_CONTROL_FILE)));
4766
4767         errno = 0;
4768         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
4769         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4770         {
4771                 /* if write didn't set errno, assume problem is no disk space */
4772                 if (errno == 0)
4773                         errno = ENOSPC;
4774                 ereport(PANIC,
4775                                 (errcode_for_file_access(),
4776                                  errmsg("could not write to file \"%s\": %m",
4777                                                 XLOG_CONTROL_FILE)));
4778         }
4779         pgstat_report_wait_end();
4780
4781         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
4782         if (pg_fsync(fd) != 0)
4783                 ereport(PANIC,
4784                                 (errcode_for_file_access(),
4785                                  errmsg("could not fsync file \"%s\": %m",
4786                                                 XLOG_CONTROL_FILE)));
4787         pgstat_report_wait_end();
4788
4789         if (close(fd))
4790                 ereport(PANIC,
4791                                 (errcode_for_file_access(),
4792                                  errmsg("could not close file \"%s\": %m",
4793                                                 XLOG_CONTROL_FILE)));
4794 }
4795
4796 /*
4797  * Returns the unique system identifier from control file.
4798  */
4799 uint64
4800 GetSystemIdentifier(void)
4801 {
4802         Assert(ControlFile != NULL);
4803         return ControlFile->system_identifier;
4804 }
4805
4806 /*
4807  * Returns the random nonce from control file.
4808  */
4809 char *
4810 GetMockAuthenticationNonce(void)
4811 {
4812         Assert(ControlFile != NULL);
4813         return ControlFile->mock_authentication_nonce;
4814 }
4815
4816 /*
4817  * Are checksums enabled for data pages?
4818  */
4819 bool
4820 DataChecksumsEnabled(void)
4821 {
4822         Assert(ControlFile != NULL);
4823         return (ControlFile->data_checksum_version > 0);
4824 }
4825
4826 /*
4827  * Returns a fake LSN for unlogged relations.
4828  *
4829  * Each call generates an LSN that is greater than any previous value
4830  * returned. The current counter value is saved and restored across clean
4831  * shutdowns, but like unlogged relations, does not survive a crash. This can
4832  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4833  * LSN-like increasing sequence of numbers without writing any WAL.
4834  */
4835 XLogRecPtr
4836 GetFakeLSNForUnloggedRel(void)
4837 {
4838         XLogRecPtr      nextUnloggedLSN;
4839
4840         /* increment the unloggedLSN counter, need SpinLock */
4841         SpinLockAcquire(&XLogCtl->ulsn_lck);
4842         nextUnloggedLSN = XLogCtl->unloggedLSN++;
4843         SpinLockRelease(&XLogCtl->ulsn_lck);
4844
4845         return nextUnloggedLSN;
4846 }
4847
4848 /*
4849  * Auto-tune the number of XLOG buffers.
4850  *
4851  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4852  * a maximum of one XLOG segment (there is little reason to think that more
4853  * is helpful, at least so long as we force an fsync when switching log files)
4854  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4855  * 9.1, when auto-tuning was added).
4856  *
4857  * This should not be called until NBuffers has received its final value.
4858  */
4859 static int
4860 XLOGChooseNumBuffers(void)
4861 {
4862         int                     xbuffers;
4863
4864         xbuffers = NBuffers / 32;
4865         if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4866                 xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4867         if (xbuffers < 8)
4868                 xbuffers = 8;
4869         return xbuffers;
4870 }
4871
4872 /*
4873  * GUC check_hook for wal_buffers
4874  */
4875 bool
4876 check_wal_buffers(int *newval, void **extra, GucSource source)
4877 {
4878         /*
4879          * -1 indicates a request for auto-tune.
4880          */
4881         if (*newval == -1)
4882         {
4883                 /*
4884                  * If we haven't yet changed the boot_val default of -1, just let it
4885                  * be.  We'll fix it when XLOGShmemSize is called.
4886                  */
4887                 if (XLOGbuffers == -1)
4888                         return true;
4889
4890                 /* Otherwise, substitute the auto-tune value */
4891                 *newval = XLOGChooseNumBuffers();
4892         }
4893
4894         /*
4895          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4896          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4897          * the case, we just silently treat such values as a request for the
4898          * minimum.  (We could throw an error instead, but that doesn't seem very
4899          * helpful.)
4900          */
4901         if (*newval < 4)
4902                 *newval = 4;
4903
4904         return true;
4905 }
4906
4907 /*
4908  * Read the control file, set respective GUCs.
4909  *
4910  * This is to be called during startup, including a crash recovery cycle,
4911  * unless in bootstrap mode, where no control file yet exists.  As there's no
4912  * usable shared memory yet (its sizing can depend on the contents of the
4913  * control file!), first store the contents in local memory. XLOGShmemInit()
4914  * will then copy it to shared memory later.
4915  *
4916  * reset just controls whether previous contents are to be expected (in the
4917  * reset case, there's a dangling pointer into old shared memory), or not.
4918  */
4919 void
4920 LocalProcessControlFile(bool reset)
4921 {
4922         Assert(reset || ControlFile == NULL);
4923         ControlFile = palloc(sizeof(ControlFileData));
4924         ReadControlFile();
4925 }
4926
4927 /*
4928  * Initialization of shared memory for XLOG
4929  */
4930 Size
4931 XLOGShmemSize(void)
4932 {
4933         Size            size;
4934
4935         /*
4936          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4937          * This isn't an amazingly clean place to do this, but we must wait till
4938          * NBuffers has received its final value, and must do it before using the
4939          * value of XLOGbuffers to do anything important.
4940          */
4941         if (XLOGbuffers == -1)
4942         {
4943                 char            buf[32];
4944
4945                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4946                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4947         }
4948         Assert(XLOGbuffers > 0);
4949
4950         /* XLogCtl */
4951         size = sizeof(XLogCtlData);
4952
4953         /* WAL insertion locks, plus alignment */
4954         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4955         /* xlblocks array */
4956         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4957         /* extra alignment padding for XLOG I/O buffers */
4958         size = add_size(size, XLOG_BLCKSZ);
4959         /* and the buffers themselves */
4960         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4961
4962         /*
4963          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4964          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4965          * routine again below to compute the actual allocation size.
4966          */
4967
4968         return size;
4969 }
4970
4971 void
4972 XLOGShmemInit(void)
4973 {
4974         bool            foundCFile,
4975                                 foundXLog;
4976         char       *allocptr;
4977         int                     i;
4978         ControlFileData *localControlFile;
4979
4980 #ifdef WAL_DEBUG
4981
4982         /*
4983          * Create a memory context for WAL debugging that's exempt from the normal
4984          * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
4985          * an allocation fails, but wal_debug is not for production use anyway.
4986          */
4987         if (walDebugCxt == NULL)
4988         {
4989                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4990                                                                                         "WAL Debug",
4991                                                                                         ALLOCSET_DEFAULT_SIZES);
4992                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
4993         }
4994 #endif
4995
4996
4997         XLogCtl = (XLogCtlData *)
4998                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4999
5000         localControlFile = ControlFile;
5001         ControlFile = (ControlFileData *)
5002                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5003
5004         if (foundCFile || foundXLog)
5005         {
5006                 /* both should be present or neither */
5007                 Assert(foundCFile && foundXLog);
5008
5009                 /* Initialize local copy of WALInsertLocks and register the tranche */
5010                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
5011                 LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
5012                                                           "wal_insert");
5013
5014                 if (localControlFile)
5015                         pfree(localControlFile);
5016                 return;
5017         }
5018         memset(XLogCtl, 0, sizeof(XLogCtlData));
5019
5020         /*
5021          * Already have read control file locally, unless in bootstrap mode. Move
5022          * contents into shared memory.
5023          */
5024         if (localControlFile)
5025         {
5026                 memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
5027                 pfree(localControlFile);
5028         }
5029
5030         /*
5031          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5032          * multiple of the alignment for same, so no extra alignment padding is
5033          * needed here.
5034          */
5035         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5036         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5037         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5038         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5039
5040
5041         /* WAL insertion locks. Ensure they're aligned to the full padded size */
5042         allocptr += sizeof(WALInsertLockPadded) -
5043                 ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5044         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5045                 (WALInsertLockPadded *) allocptr;
5046         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5047
5048         LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
5049         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
5050         {
5051                 LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5052                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
5053                 WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5054         }
5055
5056         /*
5057          * Align the start of the page buffers to a full xlog block size boundary.
5058          * This simplifies some calculations in XLOG insertion. It is also
5059          * required for O_DIRECT.
5060          */
5061         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5062         XLogCtl->pages = allocptr;
5063         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5064
5065         /*
5066          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5067          * in additional info.)
5068          */
5069         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5070         XLogCtl->SharedRecoveryInProgress = true;
5071         XLogCtl->SharedHotStandbyActive = false;
5072         XLogCtl->WalWriterSleeping = false;
5073
5074         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5075         SpinLockInit(&XLogCtl->info_lck);
5076         SpinLockInit(&XLogCtl->ulsn_lck);
5077         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5078 }
5079
5080 /*
5081  * This func must be called ONCE on system install.  It creates pg_control
5082  * and the initial XLOG segment.
5083  */
5084 void
5085 BootStrapXLOG(void)
5086 {
5087         CheckPoint      checkPoint;
5088         char       *buffer;
5089         XLogPageHeader page;
5090         XLogLongPageHeader longpage;
5091         XLogRecord *record;
5092         char       *recptr;
5093         bool            use_existent;
5094         uint64          sysidentifier;
5095         char            mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
5096         struct timeval tv;
5097         pg_crc32c       crc;
5098
5099         /*
5100          * Select a hopefully-unique system identifier code for this installation.
5101          * We use the result of gettimeofday(), including the fractional seconds
5102          * field, as being about as unique as we can easily get.  (Think not to
5103          * use random(), since it hasn't been seeded and there's no portable way
5104          * to seed it other than the system clock value...)  The upper half of the
5105          * uint64 value is just the tv_sec part, while the lower half contains the
5106          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5107          * PID for a little extra uniqueness.  A person knowing this encoding can
5108          * determine the initialization time of the installation, which could
5109          * perhaps be useful sometimes.
5110          */
5111         gettimeofday(&tv, NULL);
5112         sysidentifier = ((uint64) tv.tv_sec) << 32;
5113         sysidentifier |= ((uint64) tv.tv_usec) << 12;
5114         sysidentifier |= getpid() & 0xFFF;
5115
5116         /*
5117          * Generate a random nonce. This is used for authentication requests that
5118          * will fail because the user does not exist. The nonce is used to create
5119          * a genuine-looking password challenge for the non-existent user, in lieu
5120          * of an actual stored password.
5121          */
5122         if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
5123                 ereport(PANIC,
5124                                 (errcode(ERRCODE_INTERNAL_ERROR),
5125                                  errmsg("could not generate secret authorization token")));
5126
5127         /* First timeline ID is always 1 */
5128         ThisTimeLineID = 1;
5129
5130         /* page buffer must be aligned suitably for O_DIRECT */
5131         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5132         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5133         memset(page, 0, XLOG_BLCKSZ);
5134
5135         /*
5136          * Set up information for the initial checkpoint record
5137          *
5138          * The initial checkpoint record is written to the beginning of the WAL
5139          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5140          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5141          */
5142         checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5143         checkPoint.ThisTimeLineID = ThisTimeLineID;
5144         checkPoint.PrevTimeLineID = ThisTimeLineID;
5145         checkPoint.fullPageWrites = fullPageWrites;
5146         checkPoint.nextXidEpoch = 0;
5147         checkPoint.nextXid = FirstNormalTransactionId;
5148         checkPoint.nextOid = FirstBootstrapObjectId;
5149         checkPoint.nextMulti = FirstMultiXactId;
5150         checkPoint.nextMultiOffset = 0;
5151         checkPoint.oldestXid = FirstNormalTransactionId;
5152         checkPoint.oldestXidDB = TemplateDbOid;
5153         checkPoint.oldestMulti = FirstMultiXactId;
5154         checkPoint.oldestMultiDB = TemplateDbOid;
5155         checkPoint.oldestCommitTsXid = InvalidTransactionId;
5156         checkPoint.newestCommitTsXid = InvalidTransactionId;
5157         checkPoint.time = (pg_time_t) time(NULL);
5158         checkPoint.oldestActiveXid = InvalidTransactionId;
5159
5160         ShmemVariableCache->nextXid = checkPoint.nextXid;
5161         ShmemVariableCache->nextOid = checkPoint.nextOid;
5162         ShmemVariableCache->oidCount = 0;
5163         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5164         AdvanceOldestClogXid(checkPoint.oldestXid);
5165         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5166         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5167         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5168
5169         /* Set up the XLOG page header */
5170         page->xlp_magic = XLOG_PAGE_MAGIC;
5171         page->xlp_info = XLP_LONG_HEADER;
5172         page->xlp_tli = ThisTimeLineID;
5173         page->xlp_pageaddr = wal_segment_size;
5174         longpage = (XLogLongPageHeader) page;
5175         longpage->xlp_sysid = sysidentifier;
5176         longpage->xlp_seg_size = wal_segment_size;
5177         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5178
5179         /* Insert the initial checkpoint record */
5180         recptr = ((char *) page + SizeOfXLogLongPHD);
5181         record = (XLogRecord *) recptr;
5182         record->xl_prev = 0;
5183         record->xl_xid = InvalidTransactionId;
5184         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5185         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5186         record->xl_rmid = RM_XLOG_ID;
5187         recptr += SizeOfXLogRecord;
5188         /* fill the XLogRecordDataHeaderShort struct */
5189         *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5190         *(recptr++) = sizeof(checkPoint);
5191         memcpy(recptr, &checkPoint, sizeof(checkPoint));
5192         recptr += sizeof(checkPoint);
5193         Assert(recptr - (char *) record == record->xl_tot_len);
5194
5195         INIT_CRC32C(crc);
5196         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5197         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5198         FIN_CRC32C(crc);
5199         record->xl_crc = crc;
5200
5201         /* Create first XLOG segment file */
5202         use_existent = false;
5203         openLogFile = XLogFileInit(1, &use_existent, false);
5204
5205         /* Write the first page with the initial record */
5206         errno = 0;
5207         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5208         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5209         {
5210                 /* if write didn't set errno, assume problem is no disk space */
5211                 if (errno == 0)
5212                         errno = ENOSPC;
5213                 ereport(PANIC,
5214                                 (errcode_for_file_access(),
5215                                  errmsg("could not write bootstrap write-ahead log file: %m")));
5216         }
5217         pgstat_report_wait_end();
5218
5219         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5220         if (pg_fsync(openLogFile) != 0)
5221                 ereport(PANIC,
5222                                 (errcode_for_file_access(),
5223                                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
5224         pgstat_report_wait_end();
5225
5226         if (close(openLogFile))
5227                 ereport(PANIC,
5228                                 (errcode_for_file_access(),
5229                                  errmsg("could not close bootstrap write-ahead log file: %m")));
5230
5231         openLogFile = -1;
5232
5233         /* Now create pg_control */
5234
5235         memset(ControlFile, 0, sizeof(ControlFileData));
5236         /* Initialize pg_control status fields */
5237         ControlFile->system_identifier = sysidentifier;
5238         memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5239         ControlFile->state = DB_SHUTDOWNED;
5240         ControlFile->time = checkPoint.time;
5241         ControlFile->checkPoint = checkPoint.redo;
5242         ControlFile->checkPointCopy = checkPoint;
5243         ControlFile->unloggedLSN = 1;
5244
5245         /* Set important parameter values for use when replaying WAL */
5246         ControlFile->MaxConnections = MaxConnections;
5247         ControlFile->max_worker_processes = max_worker_processes;
5248         ControlFile->max_prepared_xacts = max_prepared_xacts;
5249         ControlFile->max_locks_per_xact = max_locks_per_xact;
5250         ControlFile->wal_level = wal_level;
5251         ControlFile->wal_log_hints = wal_log_hints;
5252         ControlFile->track_commit_timestamp = track_commit_timestamp;
5253         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5254
5255         /* some additional ControlFile fields are set in WriteControlFile() */
5256
5257         WriteControlFile();
5258
5259         /* Bootstrap the commit log, too */
5260         BootStrapCLOG();
5261         BootStrapCommitTs();
5262         BootStrapSUBTRANS();
5263         BootStrapMultiXact();
5264
5265         pfree(buffer);
5266
5267         /*
5268          * Force control file to be read - in contrast to normal processing we'd
5269          * otherwise never run the checks and GUC related initializations therein.
5270          */
5271         ReadControlFile();
5272 }
5273
5274 static char *
5275 str_time(pg_time_t tnow)
5276 {
5277         static char buf[128];
5278
5279         pg_strftime(buf, sizeof(buf),
5280                                 "%Y-%m-%d %H:%M:%S %Z",
5281                                 pg_localtime(&tnow, log_timezone));
5282
5283         return buf;
5284 }
5285
5286 /*
5287  * See if there is a recovery command file (recovery.conf), and if so
5288  * read in parameters for archive recovery and XLOG streaming.
5289  *
5290  * The file is parsed using the main configuration parser.
5291  */
5292 static void
5293 readRecoveryCommandFile(void)
5294 {
5295         FILE       *fd;
5296         TimeLineID      rtli = 0;
5297         bool            rtliGiven = false;
5298         ConfigVariable *item,
5299                            *head = NULL,
5300                            *tail = NULL;
5301         bool            recoveryTargetActionSet = false;
5302
5303
5304         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5305         if (fd == NULL)
5306         {
5307                 if (errno == ENOENT)
5308                         return;                         /* not there, so no archive recovery */
5309                 ereport(FATAL,
5310                                 (errcode_for_file_access(),
5311                                  errmsg("could not open recovery command file \"%s\": %m",
5312                                                 RECOVERY_COMMAND_FILE)));
5313         }
5314
5315         /*
5316          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5317          * no need to check the return value.
5318          */
5319         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5320
5321         FreeFile(fd);
5322
5323         for (item = head; item; item = item->next)
5324         {
5325                 if (strcmp(item->name, "restore_command") == 0)
5326                 {
5327                         recoveryRestoreCommand = pstrdup(item->value);
5328                         ereport(DEBUG2,
5329                                         (errmsg_internal("restore_command = '%s'",
5330                                                                          recoveryRestoreCommand)));
5331                 }
5332                 else if (strcmp(item->name, "recovery_end_command") == 0)
5333                 {
5334                         recoveryEndCommand = pstrdup(item->value);
5335                         ereport(DEBUG2,
5336                                         (errmsg_internal("recovery_end_command = '%s'",
5337                                                                          recoveryEndCommand)));
5338                 }
5339                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5340                 {
5341                         archiveCleanupCommand = pstrdup(item->value);
5342                         ereport(DEBUG2,
5343                                         (errmsg_internal("archive_cleanup_command = '%s'",
5344                                                                          archiveCleanupCommand)));
5345                 }
5346                 else if (strcmp(item->name, "recovery_target_action") == 0)
5347                 {
5348                         if (strcmp(item->value, "pause") == 0)
5349                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5350                         else if (strcmp(item->value, "promote") == 0)
5351                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5352                         else if (strcmp(item->value, "shutdown") == 0)
5353                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5354                         else
5355                                 ereport(ERROR,
5356                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5357                                                  errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5358                                                                 "recovery_target_action",
5359                                                                 item->value),
5360                                                  errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5361
5362                         ereport(DEBUG2,
5363                                         (errmsg_internal("recovery_target_action = '%s'",
5364                                                                          item->value)));
5365
5366                         recoveryTargetActionSet = true;
5367                 }
5368                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5369                 {
5370                         rtliGiven = true;
5371                         if (strcmp(item->value, "latest") == 0)
5372                                 rtli = 0;
5373                         else
5374                         {
5375                                 errno = 0;
5376                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5377                                 if (errno == EINVAL || errno == ERANGE)
5378                                         ereport(FATAL,
5379                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5380                                                          errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5381                                                                         item->value)));
5382                         }
5383                         if (rtli)
5384                                 ereport(DEBUG2,
5385                                                 (errmsg_internal("recovery_target_timeline = %u", rtli)));
5386                         else
5387                                 ereport(DEBUG2,
5388                                                 (errmsg_internal("recovery_target_timeline = latest")));
5389                 }
5390                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5391                 {
5392                         errno = 0;
5393                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5394                         if (errno == EINVAL || errno == ERANGE)
5395                                 ereport(FATAL,
5396                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5397                                                  errmsg("recovery_target_xid is not a valid number: \"%s\"",
5398                                                                 item->value)));
5399                         ereport(DEBUG2,
5400                                         (errmsg_internal("recovery_target_xid = %u",
5401                                                                          recoveryTargetXid)));
5402                         recoveryTarget = RECOVERY_TARGET_XID;
5403                 }
5404                 else if (strcmp(item->name, "recovery_target_time") == 0)
5405                 {
5406                         recoveryTarget = RECOVERY_TARGET_TIME;
5407
5408                         if (strcmp(item->value, "epoch") == 0 ||
5409                                 strcmp(item->value, "infinity") == 0 ||
5410                                 strcmp(item->value, "-infinity") == 0 ||
5411                                 strcmp(item->value, "now") == 0 ||
5412                                 strcmp(item->value, "today") == 0 ||
5413                                 strcmp(item->value, "tomorrow") == 0 ||
5414                                 strcmp(item->value, "yesterday") == 0)
5415                                 ereport(FATAL,
5416                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5417                                                  errmsg("recovery_target_time is not a valid timestamp: \"%s\"",
5418                                                                 item->value)));
5419
5420                         /*
5421                          * Convert the time string given by the user to TimestampTz form.
5422                          */
5423                         recoveryTargetTime =
5424                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5425                                                                                                                 CStringGetDatum(item->value),
5426                                                                                                                 ObjectIdGetDatum(InvalidOid),
5427                                                                                                                 Int32GetDatum(-1)));
5428                         ereport(DEBUG2,
5429                                         (errmsg_internal("recovery_target_time = '%s'",
5430                                                                          timestamptz_to_str(recoveryTargetTime))));
5431                 }
5432                 else if (strcmp(item->name, "recovery_target_name") == 0)
5433                 {
5434                         recoveryTarget = RECOVERY_TARGET_NAME;
5435
5436                         recoveryTargetName = pstrdup(item->value);
5437                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5438                                 ereport(FATAL,
5439                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5440                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5441                                                                 MAXFNAMELEN - 1)));
5442
5443                         ereport(DEBUG2,
5444                                         (errmsg_internal("recovery_target_name = '%s'",
5445                                                                          recoveryTargetName)));
5446                 }
5447                 else if (strcmp(item->name, "recovery_target_lsn") == 0)
5448                 {
5449                         recoveryTarget = RECOVERY_TARGET_LSN;
5450
5451                         /*
5452                          * Convert the LSN string given by the user to XLogRecPtr form.
5453                          */
5454                         recoveryTargetLSN =
5455                                 DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
5456                                                                                                 CStringGetDatum(item->value),
5457                                                                                                 ObjectIdGetDatum(InvalidOid),
5458                                                                                                 Int32GetDatum(-1)));
5459                         ereport(DEBUG2,
5460                                         (errmsg_internal("recovery_target_lsn = '%X/%X'",
5461                                                                          (uint32) (recoveryTargetLSN >> 32),
5462                                                                          (uint32) recoveryTargetLSN)));
5463                 }
5464                 else if (strcmp(item->name, "recovery_target") == 0)
5465                 {
5466                         if (strcmp(item->value, "immediate") == 0)
5467                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5468                         else
5469                                 ereport(ERROR,
5470                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5471                                                  errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5472                                                                 "recovery_target",
5473                                                                 item->value),
5474                                                  errhint("The only allowed value is \"immediate\".")));
5475                         ereport(DEBUG2,
5476                                         (errmsg_internal("recovery_target = '%s'",
5477                                                                          item->value)));
5478                 }
5479                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5480                 {
5481                         /*
5482                          * does nothing if a recovery_target is not also set
5483                          */
5484                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5485                                 ereport(ERROR,
5486                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5487                                                  errmsg("parameter \"%s\" requires a Boolean value",
5488                                                                 "recovery_target_inclusive")));
5489                         ereport(DEBUG2,
5490                                         (errmsg_internal("recovery_target_inclusive = %s",
5491                                                                          item->value)));
5492                 }
5493                 else if (strcmp(item->name, "standby_mode") == 0)
5494                 {
5495                         if (!parse_bool(item->value, &StandbyModeRequested))
5496                                 ereport(ERROR,
5497                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5498                                                  errmsg("parameter \"%s\" requires a Boolean value",
5499                                                                 "standby_mode")));
5500                         ereport(DEBUG2,
5501                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5502                 }
5503                 else if (strcmp(item->name, "primary_conninfo") == 0)
5504                 {
5505                         PrimaryConnInfo = pstrdup(item->value);
5506                         ereport(DEBUG2,
5507                                         (errmsg_internal("primary_conninfo = '%s'",
5508                                                                          PrimaryConnInfo)));
5509                 }
5510                 else if (strcmp(item->name, "primary_slot_name") == 0)
5511                 {
5512                         ReplicationSlotValidateName(item->value, ERROR);
5513                         PrimarySlotName = pstrdup(item->value);
5514                         ereport(DEBUG2,
5515                                         (errmsg_internal("primary_slot_name = '%s'",
5516                                                                          PrimarySlotName)));
5517                 }
5518                 else if (strcmp(item->name, "trigger_file") == 0)
5519                 {
5520                         TriggerFile = pstrdup(item->value);
5521                         ereport(DEBUG2,
5522                                         (errmsg_internal("trigger_file = '%s'",
5523                                                                          TriggerFile)));
5524                 }
5525                 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5526                 {
5527                         const char *hintmsg;
5528
5529                         if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5530                                                    &hintmsg))
5531                                 ereport(ERROR,
5532                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5533                                                  errmsg("parameter \"%s\" requires a temporal value",
5534                                                                 "recovery_min_apply_delay"),
5535                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5536                         ereport(DEBUG2,
5537                                         (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5538                 }
5539                 else
5540                         ereport(FATAL,
5541                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5542                                          errmsg("unrecognized recovery parameter \"%s\"",
5543                                                         item->name)));
5544         }
5545
5546         /*
5547          * Check for compulsory parameters
5548          */
5549         if (StandbyModeRequested)
5550         {
5551                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5552                         ereport(WARNING,
5553                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5554                                                         RECOVERY_COMMAND_FILE),
5555                                          errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5556         }
5557         else
5558         {
5559                 if (recoveryRestoreCommand == NULL)
5560                         ereport(FATAL,
5561                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5562                                          errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5563                                                         RECOVERY_COMMAND_FILE)));
5564         }
5565
5566         /*
5567          * Override any inconsistent requests. Not that this is a change of
5568          * behaviour in 9.5; prior to this we simply ignored a request to pause if
5569          * hot_standby = off, which was surprising behaviour.
5570          */
5571         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5572                 recoveryTargetActionSet &&
5573                 !EnableHotStandby)
5574                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5575
5576         /*
5577          * We don't support standby_mode in standalone backends; that requires
5578          * other processes such as the WAL receiver to be alive.
5579          */
5580         if (StandbyModeRequested && !IsUnderPostmaster)
5581                 ereport(FATAL,
5582                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5583                                  errmsg("standby mode is not supported by single-user servers")));
5584
5585         /* Enable fetching from archive recovery area */
5586         ArchiveRecoveryRequested = true;
5587
5588         /*
5589          * If user specified recovery_target_timeline, validate it or compute the
5590          * "latest" value.  We can't do this until after we've gotten the restore
5591          * command and set InArchiveRecovery, because we need to fetch timeline
5592          * history files from the archive.
5593          */
5594         if (rtliGiven)
5595         {
5596                 if (rtli)
5597                 {
5598                         /* Timeline 1 does not have a history file, all else should */
5599                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5600                                 ereport(FATAL,
5601                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5602                                                  errmsg("recovery target timeline %u does not exist",
5603                                                                 rtli)));
5604                         recoveryTargetTLI = rtli;
5605                         recoveryTargetIsLatest = false;
5606                 }
5607                 else
5608                 {
5609                         /* We start the "latest" search from pg_control's timeline */
5610                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5611                         recoveryTargetIsLatest = true;
5612                 }
5613         }
5614
5615         FreeConfigVariables(head);
5616 }
5617
5618 /*
5619  * Exit archive-recovery state
5620  */
5621 static void
5622 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5623 {
5624         char            recoveryPath[MAXPGPATH];
5625         char            xlogfname[MAXFNAMELEN];
5626         XLogSegNo       endLogSegNo;
5627         XLogSegNo       startLogSegNo;
5628
5629         /* we always switch to a new timeline after archive recovery */
5630         Assert(endTLI != ThisTimeLineID);
5631
5632         /*
5633          * We are no longer in archive recovery state.
5634          */
5635         InArchiveRecovery = false;
5636
5637         /*
5638          * Update min recovery point one last time.
5639          */
5640         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5641
5642         /*
5643          * If the ending log segment is still open, close it (to avoid problems on
5644          * Windows with trying to rename or delete an open file).
5645          */
5646         if (readFile >= 0)
5647         {
5648                 close(readFile);
5649                 readFile = -1;
5650         }
5651
5652         /*
5653          * Calculate the last segment on the old timeline, and the first segment
5654          * on the new timeline. If the switch happens in the middle of a segment,
5655          * they are the same, but if the switch happens exactly at a segment
5656          * boundary, startLogSegNo will be endLogSegNo + 1.
5657          */
5658         XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5659         XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5660
5661         /*
5662          * Initialize the starting WAL segment for the new timeline. If the switch
5663          * happens in the middle of a segment, copy data from the last WAL segment
5664          * of the old timeline up to the switch point, to the starting WAL segment
5665          * on the new timeline.
5666          */
5667         if (endLogSegNo == startLogSegNo)
5668         {
5669                 /*
5670                  * Make a copy of the file on the new timeline.
5671                  *
5672                  * Writing WAL isn't allowed yet, so there are no locking
5673                  * considerations. But we should be just as tense as XLogFileInit to
5674                  * avoid emplacing a bogus file.
5675                  */
5676                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5677                                          XLogSegmentOffset(endOfLog, wal_segment_size));
5678         }
5679         else
5680         {
5681                 /*
5682                  * The switch happened at a segment boundary, so just create the next
5683                  * segment on the new timeline.
5684                  */
5685                 bool            use_existent = true;
5686                 int                     fd;
5687
5688                 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5689
5690                 if (close(fd))
5691                         ereport(ERROR,
5692                                         (errcode_for_file_access(),
5693                                          errmsg("could not close file \"%s\": %m",
5694                                                         XLogFileNameP(ThisTimeLineID, startLogSegNo))));
5695         }
5696
5697         /*
5698          * Let's just make real sure there are not .ready or .done flags posted
5699          * for the new segment.
5700          */
5701         XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
5702         XLogArchiveCleanup(xlogfname);
5703
5704         /*
5705          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5706          * of it.
5707          */
5708         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5709         unlink(recoveryPath);           /* ignore any error */
5710
5711         /* Get rid of any remaining recovered timeline-history file, too */
5712         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5713         unlink(recoveryPath);           /* ignore any error */
5714
5715         /*
5716          * Rename the config file out of the way, so that we don't accidentally
5717          * re-enter archive recovery mode in a subsequent crash.
5718          */
5719         unlink(RECOVERY_COMMAND_DONE);
5720         durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
5721
5722         ereport(LOG,
5723                         (errmsg("archive recovery complete")));
5724 }
5725
5726 /*
5727  * Extract timestamp from WAL record.
5728  *
5729  * If the record contains a timestamp, returns true, and saves the timestamp
5730  * in *recordXtime. If the record type has no timestamp, returns false.
5731  * Currently, only transaction commit/abort records and restore points contain
5732  * timestamps.
5733  */
5734 static bool
5735 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5736 {
5737         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5738         uint8           xact_info = info & XLOG_XACT_OPMASK;
5739         uint8           rmid = XLogRecGetRmid(record);
5740
5741         if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5742         {
5743                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5744                 return true;
5745         }
5746         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5747                                                            xact_info == XLOG_XACT_COMMIT_PREPARED))
5748         {
5749                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5750                 return true;
5751         }
5752         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5753                                                            xact_info == XLOG_XACT_ABORT_PREPARED))
5754         {
5755                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5756                 return true;
5757         }
5758         return false;
5759 }
5760
5761 /*
5762  * For point-in-time recovery, this function decides whether we want to
5763  * stop applying the XLOG before the current record.
5764  *
5765  * Returns true if we are stopping, false otherwise. If stopping, some
5766  * information is saved in recoveryStopXid et al for use in annotating the
5767  * new timeline's history file.
5768  */
5769 static bool
5770 recoveryStopsBefore(XLogReaderState *record)
5771 {
5772         bool            stopsHere = false;
5773         uint8           xact_info;
5774         bool            isCommit;
5775         TimestampTz recordXtime = 0;
5776         TransactionId recordXid;
5777
5778         /* Check if we should stop as soon as reaching consistency */
5779         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5780         {
5781                 ereport(LOG,
5782                                 (errmsg("recovery stopping after reaching consistency")));
5783
5784                 recoveryStopAfter = false;
5785                 recoveryStopXid = InvalidTransactionId;
5786                 recoveryStopLSN = InvalidXLogRecPtr;
5787                 recoveryStopTime = 0;
5788                 recoveryStopName[0] = '\0';
5789                 return true;
5790         }
5791
5792         /* Check if target LSN has been reached */
5793         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5794                 !recoveryTargetInclusive &&
5795                 record->ReadRecPtr >= recoveryTargetLSN)
5796         {
5797                 recoveryStopAfter = false;
5798                 recoveryStopXid = InvalidTransactionId;
5799                 recoveryStopLSN = record->ReadRecPtr;
5800                 recoveryStopTime = 0;
5801                 recoveryStopName[0] = '\0';
5802                 ereport(LOG,
5803                                 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
5804                                                 (uint32) (recoveryStopLSN >> 32),
5805                                                 (uint32) recoveryStopLSN)));
5806                 return true;
5807         }
5808
5809         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5810         if (XLogRecGetRmid(record) != RM_XACT_ID)
5811                 return false;
5812
5813         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5814
5815         if (xact_info == XLOG_XACT_COMMIT)
5816         {
5817                 isCommit = true;
5818                 recordXid = XLogRecGetXid(record);
5819         }
5820         else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5821         {
5822                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5823                 xl_xact_parsed_commit parsed;
5824
5825                 isCommit = true;
5826                 ParseCommitRecord(XLogRecGetInfo(record),
5827                                                   xlrec,
5828                                                   &parsed);
5829                 recordXid = parsed.twophase_xid;
5830         }
5831         else if (xact_info == XLOG_XACT_ABORT)
5832         {
5833                 isCommit = false;
5834                 recordXid = XLogRecGetXid(record);
5835         }
5836         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5837         {
5838                 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5839                 xl_xact_parsed_abort parsed;
5840
5841                 isCommit = true;
5842                 ParseAbortRecord(XLogRecGetInfo(record),
5843                                                  xlrec,
5844                                                  &parsed);
5845                 recordXid = parsed.twophase_xid;
5846         }
5847         else
5848                 return false;
5849
5850         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5851         {
5852                 /*
5853                  * There can be only one transaction end record with this exact
5854                  * transactionid
5855                  *
5856                  * when testing for an xid, we MUST test for equality only, since
5857                  * transactions are numbered in the order they start, not the order
5858                  * they complete. A higher numbered xid will complete before you about
5859                  * 50% of the time...
5860                  */
5861                 stopsHere = (recordXid == recoveryTargetXid);
5862         }
5863
5864         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5865                 getRecordTimestamp(record, &recordXtime))
5866         {
5867                 /*
5868                  * There can be many transactions that share the same commit time, so
5869                  * we stop after the last one, if we are inclusive, or stop at the
5870                  * first one if we are exclusive
5871                  */
5872                 if (recoveryTargetInclusive)
5873                         stopsHere = (recordXtime > recoveryTargetTime);
5874                 else
5875                         stopsHere = (recordXtime >= recoveryTargetTime);
5876         }
5877
5878         if (stopsHere)
5879         {
5880                 recoveryStopAfter = false;
5881                 recoveryStopXid = recordXid;
5882                 recoveryStopTime = recordXtime;
5883                 recoveryStopLSN = InvalidXLogRecPtr;
5884                 recoveryStopName[0] = '\0';
5885
5886                 if (isCommit)
5887                 {
5888                         ereport(LOG,
5889                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5890                                                         recoveryStopXid,
5891                                                         timestamptz_to_str(recoveryStopTime))));
5892                 }
5893                 else
5894                 {
5895                         ereport(LOG,
5896                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5897                                                         recoveryStopXid,
5898                                                         timestamptz_to_str(recoveryStopTime))));
5899                 }
5900         }
5901
5902         return stopsHere;
5903 }
5904
5905 /*
5906  * Same as recoveryStopsBefore, but called after applying the record.
5907  *
5908  * We also track the timestamp of the latest applied COMMIT/ABORT
5909  * record in XLogCtl->recoveryLastXTime.
5910  */
5911 static bool
5912 recoveryStopsAfter(XLogReaderState *record)
5913 {
5914         uint8           info;
5915         uint8           xact_info;
5916         uint8           rmid;
5917         TimestampTz recordXtime;
5918
5919         info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5920         rmid = XLogRecGetRmid(record);
5921
5922         /*
5923          * There can be many restore points that share the same name; we stop at
5924          * the first one.
5925          */
5926         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5927                 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5928         {
5929                 xl_restore_point *recordRestorePointData;
5930
5931                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5932
5933                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5934                 {
5935                         recoveryStopAfter = true;
5936                         recoveryStopXid = InvalidTransactionId;
5937                         recoveryStopLSN = InvalidXLogRecPtr;
5938                         (void) getRecordTimestamp(record, &recoveryStopTime);
5939                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5940
5941                         ereport(LOG,
5942                                         (errmsg("recovery stopping at restore point \"%s\", time %s",
5943                                                         recoveryStopName,
5944                                                         timestamptz_to_str(recoveryStopTime))));
5945                         return true;
5946                 }
5947         }
5948
5949         /* Check if the target LSN has been reached */
5950         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5951                 recoveryTargetInclusive &&
5952                 record->ReadRecPtr >= recoveryTargetLSN)
5953         {
5954                 recoveryStopAfter = true;
5955                 recoveryStopXid = InvalidTransactionId;
5956                 recoveryStopLSN = record->ReadRecPtr;
5957                 recoveryStopTime = 0;
5958                 recoveryStopName[0] = '\0';
5959                 ereport(LOG,
5960                                 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
5961                                                 (uint32) (recoveryStopLSN >> 32),
5962                                                 (uint32) recoveryStopLSN)));
5963                 return true;
5964         }
5965
5966         if (rmid != RM_XACT_ID)
5967                 return false;
5968
5969         xact_info = info & XLOG_XACT_OPMASK;
5970
5971         if (xact_info == XLOG_XACT_COMMIT ||
5972                 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5973                 xact_info == XLOG_XACT_ABORT ||
5974                 xact_info == XLOG_XACT_ABORT_PREPARED)
5975         {
5976                 TransactionId recordXid;
5977
5978                 /* Update the last applied transaction timestamp */
5979                 if (getRecordTimestamp(record, &recordXtime))
5980                         SetLatestXTime(recordXtime);
5981
5982                 /* Extract the XID of the committed/aborted transaction */
5983                 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5984                 {
5985                         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5986                         xl_xact_parsed_commit parsed;
5987
5988                         ParseCommitRecord(XLogRecGetInfo(record),
5989                                                           xlrec,
5990                                                           &parsed);
5991                         recordXid = parsed.twophase_xid;
5992                 }
5993                 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5994                 {
5995                         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5996                         xl_xact_parsed_abort parsed;
5997
5998                         ParseAbortRecord(XLogRecGetInfo(record),
5999                                                          xlrec,
6000                                                          &parsed);
6001                         recordXid = parsed.twophase_xid;
6002                 }
6003                 else
6004                         recordXid = XLogRecGetXid(record);
6005
6006                 /*
6007                  * There can be only one transaction end record with this exact
6008                  * transactionid
6009                  *
6010                  * when testing for an xid, we MUST test for equality only, since
6011                  * transactions are numbered in the order they start, not the order
6012                  * they complete. A higher numbered xid will complete before you about
6013                  * 50% of the time...
6014                  */
6015                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
6016                         recordXid == recoveryTargetXid)
6017                 {
6018                         recoveryStopAfter = true;
6019                         recoveryStopXid = recordXid;
6020                         recoveryStopTime = recordXtime;
6021                         recoveryStopLSN = InvalidXLogRecPtr;
6022                         recoveryStopName[0] = '\0';
6023
6024                         if (xact_info == XLOG_XACT_COMMIT ||
6025                                 xact_info == XLOG_XACT_COMMIT_PREPARED)
6026                         {
6027                                 ereport(LOG,
6028                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
6029                                                                 recoveryStopXid,
6030                                                                 timestamptz_to_str(recoveryStopTime))));
6031                         }
6032                         else if (xact_info == XLOG_XACT_ABORT ||
6033                                          xact_info == XLOG_XACT_ABORT_PREPARED)
6034                         {
6035                                 ereport(LOG,
6036                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
6037                                                                 recoveryStopXid,
6038                                                                 timestamptz_to_str(recoveryStopTime))));
6039                         }
6040                         return true;
6041                 }
6042         }
6043
6044         /* Check if we should stop as soon as reaching consistency */
6045         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
6046         {
6047                 ereport(LOG,
6048                                 (errmsg("recovery stopping after reaching consistency")));
6049
6050                 recoveryStopAfter = true;
6051                 recoveryStopXid = InvalidTransactionId;
6052                 recoveryStopTime = 0;
6053                 recoveryStopLSN = InvalidXLogRecPtr;
6054                 recoveryStopName[0] = '\0';
6055                 return true;
6056         }
6057
6058         return false;
6059 }
6060
6061 /*
6062  * Wait until shared recoveryPause flag is cleared.
6063  *
6064  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
6065  * Probably not worth the trouble though.  This state shouldn't be one that
6066  * anyone cares about server power consumption in.
6067  */
6068 static void
6069 recoveryPausesHere(void)
6070 {
6071         /* Don't pause unless users can connect! */
6072         if (!LocalHotStandbyActive)
6073                 return;
6074
6075         ereport(LOG,
6076                         (errmsg("recovery has paused"),
6077                          errhint("Execute pg_wal_replay_resume() to continue.")));
6078
6079         while (RecoveryIsPaused())
6080         {
6081                 pg_usleep(1000000L);    /* 1000 ms */
6082                 HandleStartupProcInterrupts();
6083         }
6084 }
6085
6086 bool
6087 RecoveryIsPaused(void)
6088 {
6089         bool            recoveryPause;
6090
6091         SpinLockAcquire(&XLogCtl->info_lck);
6092         recoveryPause = XLogCtl->recoveryPause;
6093         SpinLockRelease(&XLogCtl->info_lck);
6094
6095         return recoveryPause;
6096 }
6097
6098 void
6099 SetRecoveryPause(bool recoveryPause)
6100 {
6101         SpinLockAcquire(&XLogCtl->info_lck);
6102         XLogCtl->recoveryPause = recoveryPause;
6103         SpinLockRelease(&XLogCtl->info_lck);
6104 }
6105
6106 /*
6107  * When recovery_min_apply_delay is set, we wait long enough to make sure
6108  * certain record types are applied at least that interval behind the master.
6109  *
6110  * Returns true if we waited.
6111  *
6112  * Note that the delay is calculated between the WAL record log time and
6113  * the current time on standby. We would prefer to keep track of when this
6114  * standby received each WAL record, which would allow a more consistent
6115  * approach and one not affected by time synchronisation issues, but that
6116  * is significantly more effort and complexity for little actual gain in
6117  * usability.
6118  */
6119 static bool
6120 recoveryApplyDelay(XLogReaderState *record)
6121 {
6122         uint8           xact_info;
6123         TimestampTz xtime;
6124         long            secs;
6125         int                     microsecs;
6126
6127         /* nothing to do if no delay configured */
6128         if (recovery_min_apply_delay <= 0)
6129                 return false;
6130
6131         /* no delay is applied on a database not yet consistent */
6132         if (!reachedConsistency)
6133                 return false;
6134
6135         /*
6136          * Is it a COMMIT record?
6137          *
6138          * We deliberately choose not to delay aborts since they have no effect on
6139          * MVCC. We already allow replay of records that don't have a timestamp,
6140          * so there is already opportunity for issues caused by early conflicts on
6141          * standbys.
6142          */
6143         if (XLogRecGetRmid(record) != RM_XACT_ID)
6144                 return false;
6145
6146         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
6147
6148         if (xact_info != XLOG_XACT_COMMIT &&
6149                 xact_info != XLOG_XACT_COMMIT_PREPARED)
6150                 return false;
6151
6152         if (!getRecordTimestamp(record, &xtime))
6153                 return false;
6154
6155         recoveryDelayUntilTime =
6156                 TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
6157
6158         /*
6159          * Exit without arming the latch if it's already past time to apply this
6160          * record
6161          */
6162         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6163                                                 &secs, &microsecs);
6164         if (secs <= 0 && microsecs <= 0)
6165                 return false;
6166
6167         while (true)
6168         {
6169                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6170
6171                 /* might change the trigger file's location */
6172                 HandleStartupProcInterrupts();
6173
6174                 if (CheckForStandbyTrigger())
6175                         break;
6176
6177                 /*
6178                  * Wait for difference between GetCurrentTimestamp() and
6179                  * recoveryDelayUntilTime
6180                  */
6181                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6182                                                         &secs, &microsecs);
6183
6184                 /* NB: We're ignoring waits below min_apply_delay's resolution. */
6185                 if (secs <= 0 && microsecs / 1000 <= 0)
6186                         break;
6187
6188                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6189                          secs, microsecs / 1000);
6190
6191                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6192                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6193                                   secs * 1000L + microsecs / 1000,
6194                                   WAIT_EVENT_RECOVERY_APPLY_DELAY);
6195         }
6196         return true;
6197 }
6198
6199 /*
6200  * Save timestamp of latest processed commit/abort record.
6201  *
6202  * We keep this in XLogCtl, not a simple static variable, so that it can be
6203  * seen by processes other than the startup process.  Note in particular
6204  * that CreateRestartPoint is executed in the checkpointer.
6205  */
6206 static void
6207 SetLatestXTime(TimestampTz xtime)
6208 {
6209         SpinLockAcquire(&XLogCtl->info_lck);
6210         XLogCtl->recoveryLastXTime = xtime;
6211         SpinLockRelease(&XLogCtl->info_lck);
6212 }
6213
6214 /*
6215  * Fetch timestamp of latest processed commit/abort record.
6216  */
6217 TimestampTz
6218 GetLatestXTime(void)
6219 {
6220         TimestampTz xtime;
6221
6222         SpinLockAcquire(&XLogCtl->info_lck);
6223         xtime = XLogCtl->recoveryLastXTime;
6224         SpinLockRelease(&XLogCtl->info_lck);
6225
6226         return xtime;
6227 }
6228
6229 /*
6230  * Save timestamp of the next chunk of WAL records to apply.
6231  *
6232  * We keep this in XLogCtl, not a simple static variable, so that it can be
6233  * seen by all backends.
6234  */
6235 static void
6236 SetCurrentChunkStartTime(TimestampTz xtime)
6237 {
6238         SpinLockAcquire(&XLogCtl->info_lck);
6239         XLogCtl->currentChunkStartTime = xtime;
6240         SpinLockRelease(&XLogCtl->info_lck);
6241 }
6242
6243 /*
6244  * Fetch timestamp of latest processed commit/abort record.
6245  * Startup process maintains an accurate local copy in XLogReceiptTime
6246  */
6247 TimestampTz
6248 GetCurrentChunkReplayStartTime(void)
6249 {
6250         TimestampTz xtime;
6251
6252         SpinLockAcquire(&XLogCtl->info_lck);
6253         xtime = XLogCtl->currentChunkStartTime;
6254         SpinLockRelease(&XLogCtl->info_lck);
6255
6256         return xtime;
6257 }
6258
6259 /*
6260  * Returns time of receipt of current chunk of XLOG data, as well as
6261  * whether it was received from streaming replication or from archives.
6262  */
6263 void
6264 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6265 {
6266         /*
6267          * This must be executed in the startup process, since we don't export the
6268          * relevant state to shared memory.
6269          */
6270         Assert(InRecovery);
6271
6272         *rtime = XLogReceiptTime;
6273         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6274 }
6275
6276 /*
6277  * Note that text field supplied is a parameter name and does not require
6278  * translation
6279  */
6280 #define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
6281 do { \
6282         if ((currValue) < (minValue)) \
6283                 ereport(ERROR, \
6284                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
6285                                  errmsg("hot standby is not possible because " \
6286                                                 "%s = %d is a lower setting than on the master server " \
6287                                                 "(its value was %d)", \
6288                                                 param_name, \
6289                                                 currValue, \
6290                                                 minValue))); \
6291 } while(0)
6292
6293 /*
6294  * Check to see if required parameters are set high enough on this server
6295  * for various aspects of recovery operation.
6296  *
6297  * Note that all the parameters which this function tests need to be
6298  * listed in Administrator's Overview section in high-availability.sgml.
6299  * If you change them, don't forget to update the list.
6300  */
6301 static void
6302 CheckRequiredParameterValues(void)
6303 {
6304         /*
6305          * For archive recovery, the WAL must be generated with at least 'replica'
6306          * wal_level.
6307          */
6308         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6309         {
6310                 ereport(WARNING,
6311                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6312                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6313         }
6314
6315         /*
6316          * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6317          * must have at least as many backend slots as the primary.
6318          */
6319         if (ArchiveRecoveryRequested && EnableHotStandby)
6320         {
6321                 if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6322                         ereport(ERROR,
6323                                         (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6324                                          errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6325
6326                 /* We ignore autovacuum_max_workers when we make this test. */
6327                 RecoveryRequiresIntParameter("max_connections",
6328                                                                          MaxConnections,
6329                                                                          ControlFile->MaxConnections);
6330                 RecoveryRequiresIntParameter("max_worker_processes",
6331                                                                          max_worker_processes,
6332                                                                          ControlFile->max_worker_processes);
6333                 RecoveryRequiresIntParameter("max_prepared_transactions",
6334                                                                          max_prepared_xacts,
6335                                                                          ControlFile->max_prepared_xacts);
6336                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6337                                                                          max_locks_per_xact,
6338                                                                          ControlFile->max_locks_per_xact);
6339         }
6340 }
6341
6342 /*
6343  * This must be called ONCE during postmaster or standalone-backend startup
6344  */
6345 void
6346 StartupXLOG(void)
6347 {
6348         XLogCtlInsert *Insert;
6349         CheckPoint      checkPoint;
6350         bool            wasShutdown;
6351         bool            reachedStopPoint = false;
6352         bool            haveBackupLabel = false;
6353         bool            haveTblspcMap = false;
6354         XLogRecPtr      RecPtr,
6355                                 checkPointLoc,
6356                                 EndOfLog;
6357         TimeLineID      EndOfLogTLI;
6358         TimeLineID      PrevTimeLineID;
6359         XLogRecord *record;
6360         TransactionId oldestActiveXID;
6361         bool            backupEndRequired = false;
6362         bool            backupFromStandby = false;
6363         DBState         dbstate_at_startup;
6364         XLogReaderState *xlogreader;
6365         XLogPageReadPrivate private;
6366         bool            fast_promoted = false;
6367         struct stat st;
6368
6369         /*
6370          * We should have an aux process resource owner to use, and we should not
6371          * be in a transaction that's installed some other resowner.
6372          */
6373         Assert(AuxProcessResourceOwner != NULL);
6374         Assert(CurrentResourceOwner == NULL ||
6375                    CurrentResourceOwner == AuxProcessResourceOwner);
6376         CurrentResourceOwner = AuxProcessResourceOwner;
6377
6378         /*
6379          * Verify XLOG status looks valid.
6380          */
6381         if (ControlFile->state < DB_SHUTDOWNED ||
6382                 ControlFile->state > DB_IN_PRODUCTION ||
6383                 !XRecOffIsValid(ControlFile->checkPoint))
6384                 ereport(FATAL,
6385                                 (errmsg("control file contains invalid data")));
6386
6387         if (ControlFile->state == DB_SHUTDOWNED)
6388         {
6389                 /* This is the expected case, so don't be chatty in standalone mode */
6390                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6391                                 (errmsg("database system was shut down at %s",
6392                                                 str_time(ControlFile->time))));
6393         }
6394         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6395                 ereport(LOG,
6396                                 (errmsg("database system was shut down in recovery at %s",
6397                                                 str_time(ControlFile->time))));
6398         else if (ControlFile->state == DB_SHUTDOWNING)
6399                 ereport(LOG,
6400                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6401                                                 str_time(ControlFile->time))));
6402         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6403                 ereport(LOG,
6404                                 (errmsg("database system was interrupted while in recovery at %s",
6405                                                 str_time(ControlFile->time)),
6406                                  errhint("This probably means that some data is corrupted and"
6407                                                  " you will have to use the last backup for recovery.")));
6408         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6409                 ereport(LOG,
6410                                 (errmsg("database system was interrupted while in recovery at log time %s",
6411                                                 str_time(ControlFile->checkPointCopy.time)),
6412                                  errhint("If this has occurred more than once some data might be corrupted"
6413                                                  " and you might need to choose an earlier recovery target.")));
6414         else if (ControlFile->state == DB_IN_PRODUCTION)
6415                 ereport(LOG,
6416                                 (errmsg("database system was interrupted; last known up at %s",
6417                                                 str_time(ControlFile->time))));
6418
6419         /* This is just to allow attaching to startup process with a debugger */
6420 #ifdef XLOG_REPLAY_DELAY
6421         if (ControlFile->state != DB_SHUTDOWNED)
6422                 pg_usleep(60000000L);
6423 #endif
6424
6425         /*
6426          * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6427          * someone has performed a copy for PITR, these directories may have been
6428          * excluded and need to be re-created.
6429          */
6430         ValidateXLOGDirectoryStructure();
6431
6432         /*----------
6433          * If we previously crashed, perform a couple of actions:
6434          *      - The pg_wal directory may still include some temporary WAL segments
6435          * used when creating a new segment, so perform some clean up to not
6436          * bloat this path.  This is done first as there is no point to sync this
6437          * temporary data.
6438          *      - There might be data which we had written, intending to fsync it,
6439          * but which we had not actually fsync'd yet. Therefore, a power failure
6440          * in the near future might cause earlier unflushed writes to be lost,
6441          * even though more recent data written to disk from here on would be
6442          * persisted.  To avoid that, fsync the entire data directory.
6443          *---------
6444          */
6445         if (ControlFile->state != DB_SHUTDOWNED &&
6446                 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6447         {
6448                 RemoveTempXlogFiles();
6449                 SyncDataDirectory();
6450         }
6451
6452         /*
6453          * Initialize on the assumption we want to recover to the latest timeline
6454          * that's active according to pg_control.
6455          */
6456         if (ControlFile->minRecoveryPointTLI >
6457                 ControlFile->checkPointCopy.ThisTimeLineID)
6458                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6459         else
6460                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6461
6462         /*
6463          * Check for recovery control file, and if so set up state for offline
6464          * recovery
6465          */
6466         readRecoveryCommandFile();
6467
6468         /*
6469          * Save archive_cleanup_command in shared memory so that other processes
6470          * can see it.
6471          */
6472         strlcpy(XLogCtl->archiveCleanupCommand,
6473                         archiveCleanupCommand ? archiveCleanupCommand : "",
6474                         sizeof(XLogCtl->archiveCleanupCommand));
6475
6476         if (ArchiveRecoveryRequested)
6477         {
6478                 if (StandbyModeRequested)
6479                         ereport(LOG,
6480                                         (errmsg("entering standby mode")));
6481                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6482                         ereport(LOG,
6483                                         (errmsg("starting point-in-time recovery to XID %u",
6484                                                         recoveryTargetXid)));
6485                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6486                         ereport(LOG,
6487                                         (errmsg("starting point-in-time recovery to %s",
6488                                                         timestamptz_to_str(recoveryTargetTime))));
6489                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6490                         ereport(LOG,
6491                                         (errmsg("starting point-in-time recovery to \"%s\"",
6492                                                         recoveryTargetName)));
6493                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6494                         ereport(LOG,
6495                                         (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6496                                                         (uint32) (recoveryTargetLSN >> 32),
6497                                                         (uint32) recoveryTargetLSN)));
6498                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6499                         ereport(LOG,
6500                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6501                 else
6502                         ereport(LOG,
6503                                         (errmsg("starting archive recovery")));
6504         }
6505
6506         /*
6507          * Take ownership of the wakeup latch if we're going to sleep during
6508          * recovery.
6509          */
6510         if (StandbyModeRequested)
6511                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6512
6513         /* Set up XLOG reader facility */
6514         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6515         xlogreader = XLogReaderAllocate(wal_segment_size, &XLogPageRead, &private);
6516         if (!xlogreader)
6517                 ereport(ERROR,
6518                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6519                                  errmsg("out of memory"),
6520                                  errdetail("Failed while allocating a WAL reading processor.")));
6521         xlogreader->system_identifier = ControlFile->system_identifier;
6522
6523         /*
6524          * Allocate pages dedicated to WAL consistency checks, those had better be
6525          * aligned.
6526          */
6527         replay_image_masked = (char *) palloc(BLCKSZ);
6528         master_image_masked = (char *) palloc(BLCKSZ);
6529
6530         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6531                                                   &backupFromStandby))
6532         {
6533                 List       *tablespaces = NIL;
6534
6535                 /*
6536                  * Archive recovery was requested, and thanks to the backup label
6537                  * file, we know how far we need to replay to reach consistency. Enter
6538                  * archive recovery directly.
6539                  */
6540                 InArchiveRecovery = true;
6541                 if (StandbyModeRequested)
6542                         StandbyMode = true;
6543
6544                 /*
6545                  * When a backup_label file is present, we want to roll forward from
6546                  * the checkpoint it identifies, rather than using pg_control.
6547                  */
6548                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6549                 if (record != NULL)
6550                 {
6551                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6552                         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6553                         ereport(DEBUG1,
6554                                         (errmsg("checkpoint record is at %X/%X",
6555                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6556                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6557
6558                         /*
6559                          * Make sure that REDO location exists. This may not be the case
6560                          * if there was a crash during an online backup, which left a
6561                          * backup_label around that references a WAL segment that's
6562                          * already been archived.
6563                          */
6564                         if (checkPoint.redo < checkPointLoc)
6565                         {
6566                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6567                                         ereport(FATAL,
6568                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6569                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6570                         }
6571                 }
6572                 else
6573                 {
6574                         ereport(FATAL,
6575                                         (errmsg("could not locate required checkpoint record"),
6576                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6577                         wasShutdown = false;    /* keep compiler quiet */
6578                 }
6579
6580                 /* read the tablespace_map file if present and create symlinks. */
6581                 if (read_tablespace_map(&tablespaces))
6582                 {
6583                         ListCell   *lc;
6584
6585                         foreach(lc, tablespaces)
6586                         {
6587                                 tablespaceinfo *ti = lfirst(lc);
6588                                 char       *linkloc;
6589
6590                                 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6591
6592                                 /*
6593                                  * Remove the existing symlink if any and Create the symlink
6594                                  * under PGDATA.
6595                                  */
6596                                 remove_tablespace_symlink(linkloc);
6597
6598                                 if (symlink(ti->path, linkloc) < 0)
6599                                         ereport(ERROR,
6600                                                         (errcode_for_file_access(),
6601                                                          errmsg("could not create symbolic link \"%s\": %m",
6602                                                                         linkloc)));
6603
6604                                 pfree(ti->oid);
6605                                 pfree(ti->path);
6606                                 pfree(ti);
6607                         }
6608
6609                         /* set flag to delete it later */
6610                         haveTblspcMap = true;
6611                 }
6612
6613                 /* set flag to delete it later */
6614                 haveBackupLabel = true;
6615         }
6616         else
6617         {
6618                 /*
6619                  * If tablespace_map file is present without backup_label file, there
6620                  * is no use of such file.  There is no harm in retaining it, but it
6621                  * is better to get rid of the map file so that we don't have any
6622                  * redundant file in data directory and it will avoid any sort of
6623                  * confusion.  It seems prudent though to just rename the file out of
6624                  * the way rather than delete it completely, also we ignore any error
6625                  * that occurs in rename operation as even if map file is present
6626                  * without backup_label file, it is harmless.
6627                  */
6628                 if (stat(TABLESPACE_MAP, &st) == 0)
6629                 {
6630                         unlink(TABLESPACE_MAP_OLD);
6631                         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6632                                 ereport(LOG,
6633                                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6634                                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6635                                                  errdetail("File \"%s\" was renamed to \"%s\".",
6636                                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6637                         else
6638                                 ereport(LOG,
6639                                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6640                                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6641                                                  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6642                                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6643                 }
6644
6645                 /*
6646                  * It's possible that archive recovery was requested, but we don't
6647                  * know how far we need to replay the WAL before we reach consistency.
6648                  * This can happen for example if a base backup is taken from a
6649                  * running server using an atomic filesystem snapshot, without calling
6650                  * pg_start/stop_backup. Or if you just kill a running master server
6651                  * and put it into archive recovery by creating a recovery.conf file.
6652                  *
6653                  * Our strategy in that case is to perform crash recovery first,
6654                  * replaying all the WAL present in pg_wal, and only enter archive
6655                  * recovery after that.
6656                  *
6657                  * But usually we already know how far we need to replay the WAL (up
6658                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6659                  * end-of-backup record), and we can enter archive recovery directly.
6660                  */
6661                 if (ArchiveRecoveryRequested &&
6662                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6663                          ControlFile->backupEndRequired ||
6664                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6665                          ControlFile->state == DB_SHUTDOWNED))
6666                 {
6667                         InArchiveRecovery = true;
6668                         if (StandbyModeRequested)
6669                                 StandbyMode = true;
6670                 }
6671
6672                 /* Get the last valid checkpoint record. */
6673                 checkPointLoc = ControlFile->checkPoint;
6674                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6675                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6676                 if (record != NULL)
6677                 {
6678                         ereport(DEBUG1,
6679                                         (errmsg("checkpoint record is at %X/%X",
6680                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6681                 }
6682                 else
6683                 {
6684                         /*
6685                          * We used to attempt to go back to a secondary checkpoint record
6686                          * here, but only when not in standby_mode. We now just fail if we
6687                          * can't read the last checkpoint because this allows us to
6688                          * simplify processing around checkpoints.
6689                          */
6690                         ereport(PANIC,
6691                                         (errmsg("could not locate a valid checkpoint record")));
6692                 }
6693                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6694                 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6695         }
6696
6697         /*
6698          * Clear out any old relcache cache files.  This is *necessary* if we do
6699          * any WAL replay, since that would probably result in the cache files
6700          * being out of sync with database reality.  In theory we could leave them
6701          * in place if the database had been cleanly shut down, but it seems
6702          * safest to just remove them always and let them be rebuilt during the
6703          * first backend startup.  These files needs to be removed from all
6704          * directories including pg_tblspc, however the symlinks are created only
6705          * after reading tablespace_map file in case of archive recovery from
6706          * backup, so needs to clear old relcache files here after creating
6707          * symlinks.
6708          */
6709         RelationCacheInitFileRemove();
6710
6711         /*
6712          * If the location of the checkpoint record is not on the expected
6713          * timeline in the history of the requested timeline, we cannot proceed:
6714          * the backup is not part of the history of the requested timeline.
6715          */
6716         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6717                                                                  * record */
6718         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6719                 checkPoint.ThisTimeLineID)
6720         {
6721                 XLogRecPtr      switchpoint;
6722
6723                 /*
6724                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6725                  * not in expectedTLEs at all.
6726                  */
6727                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6728                 ereport(FATAL,
6729                                 (errmsg("requested timeline %u is not a child of this server's history",
6730                                                 recoveryTargetTLI),
6731                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6732                                                    (uint32) (ControlFile->checkPoint >> 32),
6733                                                    (uint32) ControlFile->checkPoint,
6734                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6735                                                    (uint32) (switchpoint >> 32),
6736                                                    (uint32) switchpoint)));
6737         }
6738
6739         /*
6740          * The min recovery point should be part of the requested timeline's
6741          * history, too.
6742          */
6743         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6744                 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6745                 ControlFile->minRecoveryPointTLI)
6746                 ereport(FATAL,
6747                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6748                                                 recoveryTargetTLI,
6749                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6750                                                 (uint32) ControlFile->minRecoveryPoint,
6751                                                 ControlFile->minRecoveryPointTLI)));
6752
6753         LastRec = RecPtr = checkPointLoc;
6754
6755         ereport(DEBUG1,
6756                         (errmsg_internal("redo record is at %X/%X; shutdown %s",
6757                                                          (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6758                                                          wasShutdown ? "true" : "false")));
6759         ereport(DEBUG1,
6760                         (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6761                                                          checkPoint.nextXidEpoch, checkPoint.nextXid,
6762                                                          checkPoint.nextOid)));
6763         ereport(DEBUG1,
6764                         (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6765                                                          checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6766         ereport(DEBUG1,
6767                         (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6768                                                          checkPoint.oldestXid, checkPoint.oldestXidDB)));
6769         ereport(DEBUG1,
6770                         (errmsg_internal("oldest MultiXactId: %u, in database %u",
6771                                                          checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6772         ereport(DEBUG1,
6773                         (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6774                                                          checkPoint.oldestCommitTsXid,
6775                                                          checkPoint.newestCommitTsXid)));
6776         if (!TransactionIdIsNormal(checkPoint.nextXid))
6777                 ereport(PANIC,
6778                                 (errmsg("invalid next transaction ID")));
6779
6780         /* initialize shared memory variables from the checkpoint record */
6781         ShmemVariableCache->nextXid = checkPoint.nextXid;
6782         ShmemVariableCache->nextOid = checkPoint.nextOid;
6783         ShmemVariableCache->oidCount = 0;
6784         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6785         AdvanceOldestClogXid(checkPoint.oldestXid);
6786         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6787         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6788         SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6789                                          checkPoint.newestCommitTsXid);
6790         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6791         XLogCtl->ckptXid = checkPoint.nextXid;
6792
6793         /*
6794          * Initialize replication slots, before there's a chance to remove
6795          * required resources.
6796          */
6797         StartupReplicationSlots();
6798
6799         /*
6800          * Startup logical state, needs to be setup now so we have proper data
6801          * during crash recovery.
6802          */
6803         StartupReorderBuffer();
6804
6805         /*
6806          * Startup MultiXact. We need to do this early to be able to replay
6807          * truncations.
6808          */
6809         StartupMultiXact();
6810
6811         /*
6812          * Ditto commit timestamps.  In a standby, we do it if setting is enabled
6813          * in ControlFile; in a master we base the decision on the GUC itself.
6814          */
6815         if (ArchiveRecoveryRequested ?
6816                 ControlFile->track_commit_timestamp : track_commit_timestamp)
6817                 StartupCommitTs();
6818
6819         /*
6820          * Recover knowledge about replay progress of known replication partners.
6821          */
6822         StartupReplicationOrigin();
6823
6824         /*
6825          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6826          * control file. On recovery, all unlogged relations are blown away, so
6827          * the unlogged LSN counter can be reset too.
6828          */
6829         if (ControlFile->state == DB_SHUTDOWNED)
6830                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6831         else
6832                 XLogCtl->unloggedLSN = 1;
6833
6834         /*
6835          * We must replay WAL entries using the same TimeLineID they were created
6836          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6837          * also xlog_redo()).
6838          */
6839         ThisTimeLineID = checkPoint.ThisTimeLineID;
6840
6841         /*
6842          * Copy any missing timeline history files between 'now' and the recovery
6843          * target timeline from archive to pg_wal. While we don't need those files
6844          * ourselves - the history file of the recovery target timeline covers all
6845          * the previous timelines in the history too - a cascading standby server
6846          * might be interested in them. Or, if you archive the WAL from this
6847          * server to a different archive than the master, it'd be good for all the
6848          * history files to get archived there after failover, so that you can use
6849          * one of the old timelines as a PITR target. Timeline history files are
6850          * small, so it's better to copy them unnecessarily than not copy them and
6851          * regret later.
6852          */
6853         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6854
6855         /*
6856          * Before running in recovery, scan pg_twophase and fill in its status to
6857          * be able to work on entries generated by redo.  Doing a scan before
6858          * taking any recovery action has the merit to discard any 2PC files that
6859          * are newer than the first record to replay, saving from any conflicts at
6860          * replay.  This avoids as well any subsequent scans when doing recovery
6861          * of the on-disk two-phase data.
6862          */
6863         restoreTwoPhaseData();
6864
6865         lastFullPageWrites = checkPoint.fullPageWrites;
6866
6867         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6868         doPageWrites = lastFullPageWrites;
6869
6870         if (RecPtr < checkPoint.redo)
6871                 ereport(PANIC,
6872                                 (errmsg("invalid redo in checkpoint record")));
6873
6874         /*
6875          * Check whether we need to force recovery from WAL.  If it appears to
6876          * have been a clean shutdown and we did not have a recovery.conf file,
6877          * then assume no recovery needed.
6878          */
6879         if (checkPoint.redo < RecPtr)
6880         {
6881                 if (wasShutdown)
6882                         ereport(PANIC,
6883                                         (errmsg("invalid redo record in shutdown checkpoint")));
6884                 InRecovery = true;
6885         }
6886         else if (ControlFile->state != DB_SHUTDOWNED)
6887                 InRecovery = true;
6888         else if (ArchiveRecoveryRequested)
6889         {
6890                 /* force recovery due to presence of recovery.conf */
6891                 InRecovery = true;
6892         }
6893
6894         /* REDO */
6895         if (InRecovery)
6896         {
6897                 int                     rmid;
6898
6899                 /*
6900                  * Update pg_control to show that we are recovering and to show the
6901                  * selected checkpoint as the place we are starting from. We also mark
6902                  * pg_control with any minimum recovery stop point obtained from a
6903                  * backup history file.
6904                  */
6905                 dbstate_at_startup = ControlFile->state;
6906                 if (InArchiveRecovery)
6907                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6908                 else
6909                 {
6910                         ereport(LOG,
6911                                         (errmsg("database system was not properly shut down; "
6912                                                         "automatic recovery in progress")));
6913                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6914                                 ereport(LOG,
6915                                                 (errmsg("crash recovery starts in timeline %u "
6916                                                                 "and has target timeline %u",
6917                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6918                                                                 recoveryTargetTLI)));
6919                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6920                 }
6921                 ControlFile->checkPoint = checkPointLoc;
6922                 ControlFile->checkPointCopy = checkPoint;
6923                 if (InArchiveRecovery)
6924                 {
6925                         /* initialize minRecoveryPoint if not set yet */
6926                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6927                         {
6928                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6929                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6930                         }
6931                 }
6932
6933                 /*
6934                  * Set backupStartPoint if we're starting recovery from a base backup.
6935                  *
6936                  * Also set backupEndPoint and use minRecoveryPoint as the backup end
6937                  * location if we're starting recovery from a base backup which was
6938                  * taken from a standby. In this case, the database system status in
6939                  * pg_control must indicate that the database was already in recovery.
6940                  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6941                  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6942                  * before reaching this point; e.g. because restore_command or
6943                  * primary_conninfo were faulty.
6944                  *
6945                  * Any other state indicates that the backup somehow became corrupted
6946                  * and we can't sensibly continue with recovery.
6947                  */
6948                 if (haveBackupLabel)
6949                 {
6950                         ControlFile->backupStartPoint = checkPoint.redo;
6951                         ControlFile->backupEndRequired = backupEndRequired;
6952
6953                         if (backupFromStandby)
6954                         {
6955                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6956                                         dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6957                                         ereport(FATAL,
6958                                                         (errmsg("backup_label contains data inconsistent with control file"),
6959                                                          errhint("This means that the backup is corrupted and you will "
6960                                                                          "have to use another backup for recovery.")));
6961                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6962                         }
6963                 }
6964                 ControlFile->time = (pg_time_t) time(NULL);
6965                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6966                 UpdateControlFile();
6967
6968                 /*
6969                  * Initialize our local copy of minRecoveryPoint.  When doing crash
6970                  * recovery we want to replay up to the end of WAL.  Particularly, in
6971                  * the case of a promoted standby minRecoveryPoint value in the
6972                  * control file is only updated after the first checkpoint.  However,
6973                  * if the instance crashes before the first post-recovery checkpoint
6974                  * is completed then recovery will use a stale location causing the
6975                  * startup process to think that there are still invalid page
6976                  * references when checking for data consistency.
6977                  */
6978                 if (InArchiveRecovery)
6979                 {
6980                         minRecoveryPoint = ControlFile->minRecoveryPoint;
6981                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6982                 }
6983                 else
6984                 {
6985                         minRecoveryPoint = InvalidXLogRecPtr;
6986                         minRecoveryPointTLI = 0;
6987                 }
6988
6989                 /*
6990                  * Reset pgstat data, because it may be invalid after recovery.
6991                  */
6992                 pgstat_reset_all();
6993
6994                 /*
6995                  * If there was a backup label file, it's done its job and the info
6996                  * has now been propagated into pg_control.  We must get rid of the
6997                  * label file so that if we crash during recovery, we'll pick up at
6998                  * the latest recovery restartpoint instead of going all the way back
6999                  * to the backup start point.  It seems prudent though to just rename
7000                  * the file out of the way rather than delete it completely.
7001                  */
7002                 if (haveBackupLabel)
7003                 {
7004                         unlink(BACKUP_LABEL_OLD);
7005                         durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7006                 }
7007
7008                 /*
7009                  * If there was a tablespace_map file, it's done its job and the
7010                  * symlinks have been created.  We must get rid of the map file so
7011                  * that if we crash during recovery, we don't create symlinks again.
7012                  * It seems prudent though to just rename the file out of the way
7013                  * rather than delete it completely.
7014                  */
7015                 if (haveTblspcMap)
7016                 {
7017                         unlink(TABLESPACE_MAP_OLD);
7018                         durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7019                 }
7020
7021                 /* Check that the GUCs used to generate the WAL allow recovery */
7022                 CheckRequiredParameterValues();
7023
7024                 /*
7025                  * We're in recovery, so unlogged relations may be trashed and must be
7026                  * reset.  This should be done BEFORE allowing Hot Standby
7027                  * connections, so that read-only backends don't try to read whatever
7028                  * garbage is left over from before.
7029                  */
7030                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7031
7032                 /*
7033                  * Likewise, delete any saved transaction snapshot files that got left
7034                  * behind by crashed backends.
7035                  */
7036                 DeleteAllExportedSnapshotFiles();
7037
7038                 /*
7039                  * Initialize for Hot Standby, if enabled. We won't let backends in
7040                  * yet, not until we've reached the min recovery point specified in
7041                  * control file and we've established a recovery snapshot from a
7042                  * running-xacts WAL record.
7043                  */
7044                 if (ArchiveRecoveryRequested && EnableHotStandby)
7045                 {
7046                         TransactionId *xids;
7047                         int                     nxids;
7048
7049                         ereport(DEBUG1,
7050                                         (errmsg("initializing for hot standby")));
7051
7052                         InitRecoveryTransactionEnvironment();
7053
7054                         if (wasShutdown)
7055                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7056                         else
7057                                 oldestActiveXID = checkPoint.oldestActiveXid;
7058                         Assert(TransactionIdIsValid(oldestActiveXID));
7059
7060                         /* Tell procarray about the range of xids it has to deal with */
7061                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
7062
7063                         /*
7064                          * Startup commit log and subtrans only.  MultiXact and commit
7065                          * timestamp have already been started up and other SLRUs are not
7066                          * maintained during recovery and need not be started yet.
7067                          */
7068                         StartupCLOG();
7069                         StartupSUBTRANS(oldestActiveXID);
7070
7071                         /*
7072                          * If we're beginning at a shutdown checkpoint, we know that
7073                          * nothing was running on the master at this point. So fake-up an
7074                          * empty running-xacts record and use that here and now. Recover
7075                          * additional standby state for prepared transactions.
7076                          */
7077                         if (wasShutdown)
7078                         {
7079                                 RunningTransactionsData running;
7080                                 TransactionId latestCompletedXid;
7081
7082                                 /*
7083                                  * Construct a RunningTransactions snapshot representing a
7084                                  * shut down server, with only prepared transactions still
7085                                  * alive. We're never overflowed at this point because all
7086                                  * subxids are listed with their parent prepared transactions.
7087                                  */
7088                                 running.xcnt = nxids;
7089                                 running.subxcnt = 0;
7090                                 running.subxid_overflow = false;
7091                                 running.nextXid = checkPoint.nextXid;
7092                                 running.oldestRunningXid = oldestActiveXID;
7093                                 latestCompletedXid = checkPoint.nextXid;
7094                                 TransactionIdRetreat(latestCompletedXid);
7095                                 Assert(TransactionIdIsNormal(latestCompletedXid));
7096                                 running.latestCompletedXid = latestCompletedXid;
7097                                 running.xids = xids;
7098
7099                                 ProcArrayApplyRecoveryInfo(&running);
7100
7101                                 StandbyRecoverPreparedTransactions();
7102                         }
7103                 }
7104
7105                 /* Initialize resource managers */
7106                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7107                 {
7108                         if (RmgrTable[rmid].rm_startup != NULL)
7109                                 RmgrTable[rmid].rm_startup();
7110                 }
7111
7112                 /*
7113                  * Initialize shared variables for tracking progress of WAL replay, as
7114                  * if we had just replayed the record before the REDO location (or the
7115                  * checkpoint record itself, if it's a shutdown checkpoint).
7116                  */
7117                 SpinLockAcquire(&XLogCtl->info_lck);
7118                 if (checkPoint.redo < RecPtr)
7119                         XLogCtl->replayEndRecPtr = checkPoint.redo;
7120                 else
7121                         XLogCtl->replayEndRecPtr = EndRecPtr;
7122                 XLogCtl->replayEndTLI = ThisTimeLineID;
7123                 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7124                 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7125                 XLogCtl->recoveryLastXTime = 0;
7126                 XLogCtl->currentChunkStartTime = 0;
7127                 XLogCtl->recoveryPause = false;
7128                 SpinLockRelease(&XLogCtl->info_lck);
7129
7130                 /* Also ensure XLogReceiptTime has a sane value */
7131                 XLogReceiptTime = GetCurrentTimestamp();
7132
7133                 /*
7134                  * Let postmaster know we've started redo now, so that it can launch
7135                  * checkpointer to perform restartpoints.  We don't bother during
7136                  * crash recovery as restartpoints can only be performed during
7137                  * archive recovery.  And we'd like to keep crash recovery simple, to
7138                  * avoid introducing bugs that could affect you when recovering after
7139                  * crash.
7140                  *
7141                  * After this point, we can no longer assume that we're the only
7142                  * process in addition to postmaster!  Also, fsync requests are
7143                  * subsequently to be handled by the checkpointer, not locally.
7144                  */
7145                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
7146                 {
7147                         PublishStartupProcessInformation();
7148                         SetForwardFsyncRequests();
7149                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7150                         bgwriterLaunched = true;
7151                 }
7152
7153                 /*
7154                  * Allow read-only connections immediately if we're consistent
7155                  * already.
7156                  */
7157                 CheckRecoveryConsistency();
7158
7159                 /*
7160                  * Find the first record that logically follows the checkpoint --- it
7161                  * might physically precede it, though.
7162                  */
7163                 if (checkPoint.redo < RecPtr)
7164                 {
7165                         /* back up to find the record */
7166                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
7167                 }
7168                 else
7169                 {
7170                         /* just have to read next record after CheckPoint */
7171                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7172                 }
7173
7174                 if (record != NULL)
7175                 {
7176                         ErrorContextCallback errcallback;
7177                         TimestampTz xtime;
7178
7179                         InRedo = true;
7180
7181                         ereport(LOG,
7182                                         (errmsg("redo starts at %X/%X",
7183                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7184
7185                         /*
7186                          * main redo apply loop
7187                          */
7188                         do
7189                         {
7190                                 bool            switchedTLI = false;
7191
7192 #ifdef WAL_DEBUG
7193                                 if (XLOG_DEBUG ||
7194                                         (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7195                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7196                                 {
7197                                         StringInfoData buf;
7198
7199                                         initStringInfo(&buf);
7200                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7201                                                                          (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7202                                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7203                                         xlog_outrec(&buf, xlogreader);
7204                                         appendStringInfoString(&buf, " - ");
7205                                         xlog_outdesc(&buf, xlogreader);
7206                                         elog(LOG, "%s", buf.data);
7207                                         pfree(buf.data);
7208                                 }
7209 #endif
7210
7211                                 /* Handle interrupt signals of startup process */
7212                                 HandleStartupProcInterrupts();
7213
7214                                 /*
7215                                  * Pause WAL replay, if requested by a hot-standby session via
7216                                  * SetRecoveryPause().
7217                                  *
7218                                  * Note that we intentionally don't take the info_lck spinlock
7219                                  * here.  We might therefore read a slightly stale value of
7220                                  * the recoveryPause flag, but it can't be very stale (no
7221                                  * worse than the last spinlock we did acquire).  Since a
7222                                  * pause request is a pretty asynchronous thing anyway,
7223                                  * possibly responding to it one WAL record later than we
7224                                  * otherwise would is a minor issue, so it doesn't seem worth
7225                                  * adding another spinlock cycle to prevent that.
7226                                  */
7227                                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7228                                         recoveryPausesHere();
7229
7230                                 /*
7231                                  * Have we reached our recovery target?
7232                                  */
7233                                 if (recoveryStopsBefore(xlogreader))
7234                                 {
7235                                         reachedStopPoint = true;        /* see below */
7236                                         break;
7237                                 }
7238
7239                                 /*
7240                                  * If we've been asked to lag the master, wait on latch until
7241                                  * enough time has passed.
7242                                  */
7243                                 if (recoveryApplyDelay(xlogreader))
7244                                 {
7245                                         /*
7246                                          * We test for paused recovery again here. If user sets
7247                                          * delayed apply, it may be because they expect to pause
7248                                          * recovery in case of problems, so we must test again
7249                                          * here otherwise pausing during the delay-wait wouldn't
7250                                          * work.
7251                                          */
7252                                         if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7253                                                 recoveryPausesHere();
7254                                 }
7255
7256                                 /* Setup error traceback support for ereport() */
7257                                 errcallback.callback = rm_redo_error_callback;
7258                                 errcallback.arg = (void *) xlogreader;
7259                                 errcallback.previous = error_context_stack;
7260                                 error_context_stack = &errcallback;
7261
7262                                 /*
7263                                  * ShmemVariableCache->nextXid must be beyond record's xid.
7264                                  *
7265                                  * We don't expect anyone else to modify nextXid, hence we
7266                                  * don't need to hold a lock while examining it.  We still
7267                                  * acquire the lock to modify it, though.
7268                                  */
7269                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
7270                                                                                                  ShmemVariableCache->nextXid))
7271                                 {
7272                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7273                                         ShmemVariableCache->nextXid = record->xl_xid;
7274                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
7275                                         LWLockRelease(XidGenLock);
7276                                 }
7277
7278                                 /*
7279                                  * Before replaying this record, check if this record causes
7280                                  * the current timeline to change. The record is already
7281                                  * considered to be part of the new timeline, so we update
7282                                  * ThisTimeLineID before replaying it. That's important so
7283                                  * that replayEndTLI, which is recorded as the minimum
7284                                  * recovery point's TLI if recovery stops after this record,
7285                                  * is set correctly.
7286                                  */
7287                                 if (record->xl_rmid == RM_XLOG_ID)
7288                                 {
7289                                         TimeLineID      newTLI = ThisTimeLineID;
7290                                         TimeLineID      prevTLI = ThisTimeLineID;
7291                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7292
7293                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
7294                                         {
7295                                                 CheckPoint      checkPoint;
7296
7297                                                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7298                                                 newTLI = checkPoint.ThisTimeLineID;
7299                                                 prevTLI = checkPoint.PrevTimeLineID;
7300                                         }
7301                                         else if (info == XLOG_END_OF_RECOVERY)
7302                                         {
7303                                                 xl_end_of_recovery xlrec;
7304
7305                                                 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7306                                                 newTLI = xlrec.ThisTimeLineID;
7307                                                 prevTLI = xlrec.PrevTimeLineID;
7308                                         }
7309
7310                                         if (newTLI != ThisTimeLineID)
7311                                         {
7312                                                 /* Check that it's OK to switch to this TLI */
7313                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7314
7315                                                 /* Following WAL records should be run with new TLI */
7316                                                 ThisTimeLineID = newTLI;
7317                                                 switchedTLI = true;
7318                                         }
7319                                 }
7320
7321                                 /*
7322                                  * Update shared replayEndRecPtr before replaying this record,
7323                                  * so that XLogFlush will update minRecoveryPoint correctly.
7324                                  */
7325                                 SpinLockAcquire(&XLogCtl->info_lck);
7326                                 XLogCtl->replayEndRecPtr = EndRecPtr;
7327                                 XLogCtl->replayEndTLI = ThisTimeLineID;
7328                                 SpinLockRelease(&XLogCtl->info_lck);
7329
7330                                 /*
7331                                  * If we are attempting to enter Hot Standby mode, process
7332                                  * XIDs we see
7333                                  */
7334                                 if (standbyState >= STANDBY_INITIALIZED &&
7335                                         TransactionIdIsValid(record->xl_xid))
7336                                         RecordKnownAssignedTransactionIds(record->xl_xid);
7337
7338                                 /* Now apply the WAL record itself */
7339                                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7340
7341                                 /*
7342                                  * After redo, check whether the backup pages associated with
7343                                  * the WAL record are consistent with the existing pages. This
7344                                  * check is done only if consistency check is enabled for this
7345                                  * record.
7346                                  */
7347                                 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7348                                         checkXLogConsistency(xlogreader);
7349
7350                                 /* Pop the error context stack */
7351                                 error_context_stack = errcallback.previous;
7352
7353                                 /*
7354                                  * Update lastReplayedEndRecPtr after this record has been
7355                                  * successfully replayed.
7356                                  */
7357                                 SpinLockAcquire(&XLogCtl->info_lck);
7358                                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7359                                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7360                                 SpinLockRelease(&XLogCtl->info_lck);
7361
7362                                 /*
7363                                  * If rm_redo called XLogRequestWalReceiverReply, then we wake
7364                                  * up the receiver so that it notices the updated
7365                                  * lastReplayedEndRecPtr and sends a reply to the master.
7366                                  */
7367                                 if (doRequestWalReceiverReply)
7368                                 {
7369                                         doRequestWalReceiverReply = false;
7370                                         WalRcvForceReply();
7371                                 }
7372
7373                                 /* Remember this record as the last-applied one */
7374                                 LastRec = ReadRecPtr;
7375
7376                                 /* Allow read-only connections if we're consistent now */
7377                                 CheckRecoveryConsistency();
7378
7379                                 /* Is this a timeline switch? */
7380                                 if (switchedTLI)
7381                                 {
7382                                         /*
7383                                          * Before we continue on the new timeline, clean up any
7384                                          * (possibly bogus) future WAL segments on the old
7385                                          * timeline.
7386                                          */
7387                                         RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7388
7389                                         /*
7390                                          * Wake up any walsenders to notice that we are on a new
7391                                          * timeline.
7392                                          */
7393                                         if (switchedTLI && AllowCascadeReplication())
7394                                                 WalSndWakeup();
7395                                 }
7396
7397                                 /* Exit loop if we reached inclusive recovery target */
7398                                 if (recoveryStopsAfter(xlogreader))
7399                                 {
7400                                         reachedStopPoint = true;
7401                                         break;
7402                                 }
7403
7404                                 /* Else, try to fetch the next WAL record */
7405                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7406                         } while (record != NULL);
7407
7408                         /*
7409                          * end of main redo apply loop
7410                          */
7411
7412                         if (reachedStopPoint)
7413                         {
7414                                 if (!reachedConsistency)
7415                                         ereport(FATAL,
7416                                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7417
7418                                 /*
7419                                  * This is the last point where we can restart recovery with a
7420                                  * new recovery target, if we shutdown and begin again. After
7421                                  * this, Resource Managers may choose to do permanent
7422                                  * corrective actions at end of recovery.
7423                                  */
7424                                 switch (recoveryTargetAction)
7425                                 {
7426                                         case RECOVERY_TARGET_ACTION_SHUTDOWN:
7427
7428                                                 /*
7429                                                  * exit with special return code to request shutdown
7430                                                  * of postmaster.  Log messages issued from
7431                                                  * postmaster.
7432                                                  */
7433                                                 proc_exit(3);
7434
7435                                         case RECOVERY_TARGET_ACTION_PAUSE:
7436                                                 SetRecoveryPause(true);
7437                                                 recoveryPausesHere();
7438
7439                                                 /* drop into promote */
7440
7441                                         case RECOVERY_TARGET_ACTION_PROMOTE:
7442                                                 break;
7443                                 }
7444                         }
7445
7446                         /* Allow resource managers to do any required cleanup. */
7447                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7448                         {
7449                                 if (RmgrTable[rmid].rm_cleanup != NULL)
7450                                         RmgrTable[rmid].rm_cleanup();
7451                         }
7452
7453                         ereport(LOG,
7454                                         (errmsg("redo done at %X/%X",
7455                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7456                         xtime = GetLatestXTime();
7457                         if (xtime)
7458                                 ereport(LOG,
7459                                                 (errmsg("last completed transaction was at log time %s",
7460                                                                 timestamptz_to_str(xtime))));
7461
7462                         InRedo = false;
7463                 }
7464                 else
7465                 {
7466                         /* there are no WAL records following the checkpoint */
7467                         ereport(LOG,
7468                                         (errmsg("redo is not required")));
7469                 }
7470         }
7471
7472         /*
7473          * Kill WAL receiver, if it's still running, before we continue to write
7474          * the startup checkpoint record. It will trump over the checkpoint and
7475          * subsequent records if it's still alive when we start writing WAL.
7476          */
7477         ShutdownWalRcv();
7478
7479         /*
7480          * Reset unlogged relations to the contents of their INIT fork. This is
7481          * done AFTER recovery is complete so as to include any unlogged relations
7482          * created during recovery, but BEFORE recovery is marked as having
7483          * completed successfully. Otherwise we'd not retry if any of the post
7484          * end-of-recovery steps fail.
7485          */
7486         if (InRecovery)
7487                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7488
7489         /*
7490          * We don't need the latch anymore. It's not strictly necessary to disown
7491          * it, but let's do it for the sake of tidiness.
7492          */
7493         if (StandbyModeRequested)
7494                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7495
7496         /*
7497          * We are now done reading the xlog from stream. Turn off streaming
7498          * recovery to force fetching the files (which would be required at end of
7499          * recovery, e.g., timeline history file) from archive or pg_wal.
7500          */
7501         StandbyMode = false;
7502
7503         /*
7504          * Re-fetch the last valid or last applied record, so we can identify the
7505          * exact endpoint of what we consider the valid portion of WAL.
7506          */
7507         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7508         EndOfLog = EndRecPtr;
7509
7510         /*
7511          * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7512          * the end-of-log. It could be different from the timeline that EndOfLog
7513          * nominally belongs to, if there was a timeline switch in that segment,
7514          * and we were reading the old WAL from a segment belonging to a higher
7515          * timeline.
7516          */
7517         EndOfLogTLI = xlogreader->readPageTLI;
7518
7519         /*
7520          * Complain if we did not roll forward far enough to render the backup
7521          * dump consistent.  Note: it is indeed okay to look at the local variable
7522          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7523          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7524          * advanced beyond the WAL we processed.
7525          */
7526         if (InRecovery &&
7527                 (EndOfLog < minRecoveryPoint ||
7528                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7529         {
7530                 /*
7531                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7532                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7533                  * tried to recover from an online backup but never called
7534                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7535                  * point. However, this also happens in crash recovery, if the system
7536                  * crashes while an online backup is in progress. We must not treat
7537                  * that as an error, or the database will refuse to start up.
7538                  */
7539                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7540                 {
7541                         if (ControlFile->backupEndRequired)
7542                                 ereport(FATAL,
7543                                                 (errmsg("WAL ends before end of online backup"),
7544                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7545                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7546                                 ereport(FATAL,
7547                                                 (errmsg("WAL ends before end of online backup"),
7548                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7549                         else
7550                                 ereport(FATAL,
7551                                                 (errmsg("WAL ends before consistent recovery point")));
7552                 }
7553         }
7554
7555         /*
7556          * Pre-scan prepared transactions to find out the range of XIDs present.
7557          * This information is not quite needed yet, but it is positioned here so
7558          * as potential problems are detected before any on-disk change is done.
7559          */
7560         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7561
7562         /*
7563          * Consider whether we need to assign a new timeline ID.
7564          *
7565          * If we are doing an archive recovery, we always assign a new ID.  This
7566          * handles a couple of issues.  If we stopped short of the end of WAL
7567          * during recovery, then we are clearly generating a new timeline and must
7568          * assign it a unique new ID.  Even if we ran to the end, modifying the
7569          * current last segment is problematic because it may result in trying to
7570          * overwrite an already-archived copy of that segment, and we encourage
7571          * DBAs to make their archive_commands reject that.  We can dodge the
7572          * problem by making the new active segment have a new timeline ID.
7573          *
7574          * In a normal crash recovery, we can just extend the timeline we were in.
7575          */
7576         PrevTimeLineID = ThisTimeLineID;
7577         if (ArchiveRecoveryRequested)
7578         {
7579                 char            reason[200];
7580
7581                 Assert(InArchiveRecovery);
7582
7583                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7584                 ereport(LOG,
7585                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7586
7587                 /*
7588                  * Create a comment for the history file to explain why and where
7589                  * timeline changed.
7590                  */
7591                 if (recoveryTarget == RECOVERY_TARGET_XID)
7592                         snprintf(reason, sizeof(reason),
7593                                          "%s transaction %u",
7594                                          recoveryStopAfter ? "after" : "before",
7595                                          recoveryStopXid);
7596                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7597                         snprintf(reason, sizeof(reason),
7598                                          "%s %s\n",
7599                                          recoveryStopAfter ? "after" : "before",
7600                                          timestamptz_to_str(recoveryStopTime));
7601                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7602                         snprintf(reason, sizeof(reason),
7603                                          "%s LSN %X/%X\n",
7604                                          recoveryStopAfter ? "after" : "before",
7605                                          (uint32) (recoveryStopLSN >> 32),
7606                                          (uint32) recoveryStopLSN);
7607                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7608                         snprintf(reason, sizeof(reason),
7609                                          "at restore point \"%s\"",
7610                                          recoveryStopName);
7611                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7612                         snprintf(reason, sizeof(reason), "reached consistency");
7613                 else
7614                         snprintf(reason, sizeof(reason), "no recovery target specified");
7615
7616                 /*
7617                  * We are now done reading the old WAL.  Turn off archive fetching if
7618                  * it was active, and make a writable copy of the last WAL segment.
7619                  * (Note that we also have a copy of the last block of the old WAL in
7620                  * readBuf; we will use that below.)
7621                  */
7622                 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7623
7624                 /*
7625                  * Write the timeline history file, and have it archived. After this
7626                  * point (or rather, as soon as the file is archived), the timeline
7627                  * will appear as "taken" in the WAL archive and to any standby
7628                  * servers.  If we crash before actually switching to the new
7629                  * timeline, standby servers will nevertheless think that we switched
7630                  * to the new timeline, and will try to connect to the new timeline.
7631                  * To minimize the window for that, try to do as little as possible
7632                  * between here and writing the end-of-recovery record.
7633                  */
7634                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7635                                                          EndRecPtr, reason);
7636         }
7637
7638         /* Save the selected TimeLineID in shared memory, too */
7639         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7640         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7641
7642         /*
7643          * Prepare to write WAL starting at EndOfLog location, and init xlog
7644          * buffer cache using the block containing the last record from the
7645          * previous incarnation.
7646          */
7647         Insert = &XLogCtl->Insert;
7648         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7649         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7650
7651         /*
7652          * Tricky point here: readBuf contains the *last* block that the LastRec
7653          * record spans, not the one it starts in.  The last block is indeed the
7654          * one we want to use.
7655          */
7656         if (EndOfLog % XLOG_BLCKSZ != 0)
7657         {
7658                 char       *page;
7659                 int                     len;
7660                 int                     firstIdx;
7661                 XLogRecPtr      pageBeginPtr;
7662
7663                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7664                 Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7665
7666                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7667
7668                 /* Copy the valid part of the last block, and zero the rest */
7669                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7670                 len = EndOfLog % XLOG_BLCKSZ;
7671                 memcpy(page, xlogreader->readBuf, len);
7672                 memset(page + len, 0, XLOG_BLCKSZ - len);
7673
7674                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7675                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7676         }
7677         else
7678         {
7679                 /*
7680                  * There is no partial block to copy. Just set InitializedUpTo, and
7681                  * let the first attempt to insert a log record to initialize the next
7682                  * buffer.
7683                  */
7684                 XLogCtl->InitializedUpTo = EndOfLog;
7685         }
7686
7687         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7688
7689         XLogCtl->LogwrtResult = LogwrtResult;
7690
7691         XLogCtl->LogwrtRqst.Write = EndOfLog;
7692         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7693
7694         /*
7695          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7696          * record before resource manager writes cleanup WAL records or checkpoint
7697          * record is written.
7698          */
7699         Insert->fullPageWrites = lastFullPageWrites;
7700         LocalSetXLogInsertAllowed();
7701         UpdateFullPageWrites();
7702         LocalXLogInsertAllowed = -1;
7703
7704         if (InRecovery)
7705         {
7706                 /*
7707                  * Perform a checkpoint to update all our recovery activity to disk.
7708                  *
7709                  * Note that we write a shutdown checkpoint rather than an on-line
7710                  * one. This is not particularly critical, but since we may be
7711                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7712                  * the rule that TLI only changes in shutdown checkpoints, which
7713                  * allows some extra error checking in xlog_redo.
7714                  *
7715                  * In fast promotion, only create a lightweight end-of-recovery record
7716                  * instead of a full checkpoint. A checkpoint is requested later,
7717                  * after we're fully out of recovery mode and already accepting
7718                  * queries.
7719                  */
7720                 if (bgwriterLaunched)
7721                 {
7722                         if (fast_promote)
7723                         {
7724                                 checkPointLoc = ControlFile->checkPoint;
7725
7726                                 /*
7727                                  * Confirm the last checkpoint is available for us to recover
7728                                  * from if we fail.
7729                                  */
7730                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7731                                 if (record != NULL)
7732                                 {
7733                                         fast_promoted = true;
7734
7735                                         /*
7736                                          * Insert a special WAL record to mark the end of
7737                                          * recovery, since we aren't doing a checkpoint. That
7738                                          * means that the checkpointer process may likely be in
7739                                          * the middle of a time-smoothed restartpoint and could
7740                                          * continue to be for minutes after this. That sounds
7741                                          * strange, but the effect is roughly the same and it
7742                                          * would be stranger to try to come out of the
7743                                          * restartpoint and then checkpoint. We request a
7744                                          * checkpoint later anyway, just for safety.
7745                                          */
7746                                         CreateEndOfRecoveryRecord();
7747                                 }
7748                         }
7749
7750                         if (!fast_promoted)
7751                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7752                                                                   CHECKPOINT_IMMEDIATE |
7753                                                                   CHECKPOINT_WAIT);
7754                 }
7755                 else
7756                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7757
7758                 /*
7759                  * And finally, execute the recovery_end_command, if any.
7760                  */
7761                 if (recoveryEndCommand)
7762                         ExecuteRecoveryCommand(recoveryEndCommand,
7763                                                                    "recovery_end_command",
7764                                                                    true);
7765         }
7766
7767         if (ArchiveRecoveryRequested)
7768         {
7769                 /*
7770                  * We switched to a new timeline. Clean up segments on the old
7771                  * timeline.
7772                  *
7773                  * If there are any higher-numbered segments on the old timeline,
7774                  * remove them. They might contain valid WAL, but they might also be
7775                  * pre-allocated files containing garbage. In any case, they are not
7776                  * part of the new timeline's history so we don't need them.
7777                  */
7778                 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7779
7780                 /*
7781                  * If the switch happened in the middle of a segment, what to do with
7782                  * the last, partial segment on the old timeline? If we don't archive
7783                  * it, and the server that created the WAL never archives it either
7784                  * (e.g. because it was hit by a meteor), it will never make it to the
7785                  * archive. That's OK from our point of view, because the new segment
7786                  * that we created with the new TLI contains all the WAL from the old
7787                  * timeline up to the switch point. But if you later try to do PITR to
7788                  * the "missing" WAL on the old timeline, recovery won't find it in
7789                  * the archive. It's physically present in the new file with new TLI,
7790                  * but recovery won't look there when it's recovering to the older
7791                  * timeline. On the other hand, if we archive the partial segment, and
7792                  * the original server on that timeline is still running and archives
7793                  * the completed version of the same segment later, it will fail. (We
7794                  * used to do that in 9.4 and below, and it caused such problems).
7795                  *
7796                  * As a compromise, we rename the last segment with the .partial
7797                  * suffix, and archive it. Archive recovery will never try to read
7798                  * .partial segments, so they will normally go unused. But in the odd
7799                  * PITR case, the administrator can copy them manually to the pg_wal
7800                  * directory (removing the suffix). They can be useful in debugging,
7801                  * too.
7802                  *
7803                  * If a .done or .ready file already exists for the old timeline,
7804                  * however, we had already determined that the segment is complete, so
7805                  * we can let it be archived normally. (In particular, if it was
7806                  * restored from the archive to begin with, it's expected to have a
7807                  * .done file).
7808                  */
7809                 if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
7810                         XLogArchivingActive())
7811                 {
7812                         char            origfname[MAXFNAMELEN];
7813                         XLogSegNo       endLogSegNo;
7814
7815                         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7816                         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7817
7818                         if (!XLogArchiveIsReadyOrDone(origfname))
7819                         {
7820                                 char            origpath[MAXPGPATH];
7821                                 char            partialfname[MAXFNAMELEN];
7822                                 char            partialpath[MAXPGPATH];
7823
7824                                 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7825                                 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7826                                 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7827
7828                                 /*
7829                                  * Make sure there's no .done or .ready file for the .partial
7830                                  * file.
7831                                  */
7832                                 XLogArchiveCleanup(partialfname);
7833
7834                                 durable_rename(origpath, partialpath, ERROR);
7835                                 XLogArchiveNotify(partialfname);
7836                         }
7837                 }
7838         }
7839
7840         /*
7841          * Preallocate additional log files, if wanted.
7842          */
7843         PreallocXlogFiles(EndOfLog);
7844
7845         /*
7846          * Okay, we're officially UP.
7847          */
7848         InRecovery = false;
7849
7850         /* start the archive_timeout timer and LSN running */
7851         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7852         XLogCtl->lastSegSwitchLSN = EndOfLog;
7853
7854         /* also initialize latestCompletedXid, to nextXid - 1 */
7855         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7856         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7857         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7858         LWLockRelease(ProcArrayLock);
7859
7860         /*
7861          * Start up the commit log and subtrans, if not already done for hot
7862          * standby.  (commit timestamps are started below, if necessary.)
7863          */
7864         if (standbyState == STANDBY_DISABLED)
7865         {
7866                 StartupCLOG();
7867                 StartupSUBTRANS(oldestActiveXID);
7868         }
7869
7870         /*
7871          * Perform end of recovery actions for any SLRUs that need it.
7872          */
7873         TrimCLOG();
7874         TrimMultiXact();
7875
7876         /* Reload shared-memory state for prepared transactions */
7877         RecoverPreparedTransactions();
7878
7879         /*
7880          * Shutdown the recovery environment. This must occur after
7881          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7882          */
7883         if (standbyState != STANDBY_DISABLED)
7884                 ShutdownRecoveryTransactionEnvironment();
7885
7886         /* Shut down xlogreader */
7887         if (readFile >= 0)
7888         {
7889                 close(readFile);
7890                 readFile = -1;
7891         }
7892         XLogReaderFree(xlogreader);
7893
7894         /*
7895          * If any of the critical GUCs have changed, log them before we allow
7896          * backends to write WAL.
7897          */
7898         LocalSetXLogInsertAllowed();
7899         XLogReportParameters();
7900
7901         /*
7902          * Local WAL inserts enabled, so it's time to finish initialization of
7903          * commit timestamp.
7904          */
7905         CompleteCommitTsInitialization();
7906
7907         /*
7908          * All done with end-of-recovery actions.
7909          *
7910          * Now allow backends to write WAL and update the control file status in
7911          * consequence.  The boolean flag allowing backends to write WAL is
7912          * updated while holding ControlFileLock to prevent other backends to look
7913          * at an inconsistent state of the control file in shared memory.  There
7914          * is still a small window during which backends can write WAL and the
7915          * control file is still referring to a system not in DB_IN_PRODUCTION
7916          * state while looking at the on-disk control file.
7917          *
7918          * Also, although the boolean flag to allow WAL is probably atomic in
7919          * itself, we use the info_lck here to ensure that there are no race
7920          * conditions concerning visibility of other recent updates to shared
7921          * memory.
7922          */
7923         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7924         ControlFile->state = DB_IN_PRODUCTION;
7925         ControlFile->time = (pg_time_t) time(NULL);
7926
7927         SpinLockAcquire(&XLogCtl->info_lck);
7928         XLogCtl->SharedRecoveryInProgress = false;
7929         SpinLockRelease(&XLogCtl->info_lck);
7930
7931         UpdateControlFile();
7932         LWLockRelease(ControlFileLock);
7933
7934         /*
7935          * If there were cascading standby servers connected to us, nudge any wal
7936          * sender processes to notice that we've been promoted.
7937          */
7938         WalSndWakeup();
7939
7940         /*
7941          * If this was a fast promotion, request an (online) checkpoint now. This
7942          * isn't required for consistency, but the last restartpoint might be far
7943          * back, and in case of a crash, recovering from it might take a longer
7944          * than is appropriate now that we're not in standby mode anymore.
7945          */
7946         if (fast_promoted)
7947                 RequestCheckpoint(CHECKPOINT_FORCE);
7948 }
7949
7950 /*
7951  * Checks if recovery has reached a consistent state. When consistency is
7952  * reached and we have a valid starting standby snapshot, tell postmaster
7953  * that it can start accepting read-only connections.
7954  */
7955 static void
7956 CheckRecoveryConsistency(void)
7957 {
7958         XLogRecPtr      lastReplayedEndRecPtr;
7959
7960         /*
7961          * During crash recovery, we don't reach a consistent state until we've
7962          * replayed all the WAL.
7963          */
7964         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7965                 return;
7966
7967         Assert(InArchiveRecovery);
7968
7969         /*
7970          * assume that we are called in the startup process, and hence don't need
7971          * a lock to read lastReplayedEndRecPtr
7972          */
7973         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7974
7975         /*
7976          * Have we reached the point where our base backup was completed?
7977          */
7978         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7979                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7980         {
7981                 /*
7982                  * We have reached the end of base backup, as indicated by pg_control.
7983                  * The data on disk is now consistent. Reset backupStartPoint and
7984                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7985                  * allow starting up at an earlier point even if recovery is stopped
7986                  * and restarted soon after this.
7987                  */
7988                 elog(DEBUG1, "end of backup reached");
7989
7990                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7991
7992                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7993                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7994
7995                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7996                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7997                 ControlFile->backupEndRequired = false;
7998                 UpdateControlFile();
7999
8000                 LWLockRelease(ControlFileLock);
8001         }
8002
8003         /*
8004          * Have we passed our safe starting point? Note that minRecoveryPoint is
8005          * known to be incorrectly set if ControlFile->backupEndRequired, until
8006          * the XLOG_BACKUP_END arrives to advise us of the correct
8007          * minRecoveryPoint. All we know prior to that is that we're not
8008          * consistent yet.
8009          */
8010         if (!reachedConsistency && !ControlFile->backupEndRequired &&
8011                 minRecoveryPoint <= lastReplayedEndRecPtr &&
8012                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
8013         {
8014                 /*
8015                  * Check to see if the XLOG sequence contained any unresolved
8016                  * references to uninitialized pages.
8017                  */
8018                 XLogCheckInvalidPages();
8019
8020                 reachedConsistency = true;
8021                 ereport(LOG,
8022                                 (errmsg("consistent recovery state reached at %X/%X",
8023                                                 (uint32) (lastReplayedEndRecPtr >> 32),
8024                                                 (uint32) lastReplayedEndRecPtr)));
8025         }
8026
8027         /*
8028          * Have we got a valid starting snapshot that will allow queries to be
8029          * run? If so, we can tell postmaster that the database is consistent now,
8030          * enabling connections.
8031          */
8032         if (standbyState == STANDBY_SNAPSHOT_READY &&
8033                 !LocalHotStandbyActive &&
8034                 reachedConsistency &&
8035                 IsUnderPostmaster)
8036         {
8037                 SpinLockAcquire(&XLogCtl->info_lck);
8038                 XLogCtl->SharedHotStandbyActive = true;
8039                 SpinLockRelease(&XLogCtl->info_lck);
8040
8041                 LocalHotStandbyActive = true;
8042
8043                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
8044         }
8045 }
8046
8047 /*
8048  * Is the system still in recovery?
8049  *
8050  * Unlike testing InRecovery, this works in any process that's connected to
8051  * shared memory.
8052  *
8053  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
8054  * variables the first time we see that recovery is finished.
8055  */
8056 bool
8057 RecoveryInProgress(void)
8058 {
8059         /*
8060          * We check shared state each time only until we leave recovery mode. We
8061          * can't re-enter recovery, so there's no need to keep checking after the
8062          * shared variable has once been seen false.
8063          */
8064         if (!LocalRecoveryInProgress)
8065                 return false;
8066         else
8067         {
8068                 /*
8069                  * use volatile pointer to make sure we make a fresh read of the
8070                  * shared variable.
8071                  */
8072                 volatile XLogCtlData *xlogctl = XLogCtl;
8073
8074                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
8075
8076                 /*
8077                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
8078                  * is finished. InitPostgres() relies upon this behaviour to ensure
8079                  * that InitXLOGAccess() is called at backend startup.  (If you change
8080                  * this, see also LocalSetXLogInsertAllowed.)
8081                  */
8082                 if (!LocalRecoveryInProgress)
8083                 {
8084                         /*
8085                          * If we just exited recovery, make sure we read TimeLineID and
8086                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
8087                          * weak memory ordering).
8088                          */
8089                         pg_memory_barrier();
8090                         InitXLOGAccess();
8091                 }
8092
8093                 /*
8094                  * Note: We don't need a memory barrier when we're still in recovery.
8095                  * We might exit recovery immediately after return, so the caller
8096                  * can't rely on 'true' meaning that we're still in recovery anyway.
8097                  */
8098
8099                 return LocalRecoveryInProgress;
8100         }
8101 }
8102
8103 /*
8104  * Is HotStandby active yet? This is only important in special backends
8105  * since normal backends won't ever be able to connect until this returns
8106  * true. Postmaster knows this by way of signal, not via shared memory.
8107  *
8108  * Unlike testing standbyState, this works in any process that's connected to
8109  * shared memory.  (And note that standbyState alone doesn't tell the truth
8110  * anyway.)
8111  */
8112 bool
8113 HotStandbyActive(void)
8114 {
8115         /*
8116          * We check shared state each time only until Hot Standby is active. We
8117          * can't de-activate Hot Standby, so there's no need to keep checking
8118          * after the shared variable has once been seen true.
8119          */
8120         if (LocalHotStandbyActive)
8121                 return true;
8122         else
8123         {
8124                 /* spinlock is essential on machines with weak memory ordering! */
8125                 SpinLockAcquire(&XLogCtl->info_lck);
8126                 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8127                 SpinLockRelease(&XLogCtl->info_lck);
8128
8129                 return LocalHotStandbyActive;
8130         }
8131 }
8132
8133 /*
8134  * Like HotStandbyActive(), but to be used only in WAL replay code,
8135  * where we don't need to ask any other process what the state is.
8136  */
8137 bool
8138 HotStandbyActiveInReplay(void)
8139 {
8140         Assert(AmStartupProcess() || !IsPostmasterEnvironment);
8141         return LocalHotStandbyActive;
8142 }
8143
8144 /*
8145  * Is this process allowed to insert new WAL records?
8146  *
8147  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8148  * But we also have provisions for forcing the result "true" or "false"
8149  * within specific processes regardless of the global state.
8150  */
8151 bool
8152 XLogInsertAllowed(void)
8153 {
8154         /*
8155          * If value is "unconditionally true" or "unconditionally false", just
8156          * return it.  This provides the normal fast path once recovery is known
8157          * done.
8158          */
8159         if (LocalXLogInsertAllowed >= 0)
8160                 return (bool) LocalXLogInsertAllowed;
8161
8162         /*
8163          * Else, must check to see if we're still in recovery.
8164          */
8165         if (RecoveryInProgress())
8166                 return false;
8167
8168         /*
8169          * On exit from recovery, reset to "unconditionally true", since there is
8170          * no need to keep checking.
8171          */
8172         LocalXLogInsertAllowed = 1;
8173         return true;
8174 }
8175
8176 /*
8177  * Make XLogInsertAllowed() return true in the current process only.
8178  *
8179  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
8180  * and even call LocalSetXLogInsertAllowed() again after that.
8181  */
8182 static void
8183 LocalSetXLogInsertAllowed(void)
8184 {
8185         Assert(LocalXLogInsertAllowed == -1);
8186         LocalXLogInsertAllowed = 1;
8187
8188         /* Initialize as RecoveryInProgress() would do when switching state */
8189         InitXLOGAccess();
8190 }
8191
8192 /*
8193  * Subroutine to try to fetch and validate a prior checkpoint record.
8194  *
8195  * whichChkpt identifies the checkpoint (merely for reporting purposes).
8196  * 1 for "primary", 0 for "other" (backup_label)
8197  */
8198 static XLogRecord *
8199 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8200                                          int whichChkpt, bool report)
8201 {
8202         XLogRecord *record;
8203         uint8           info;
8204
8205         if (!XRecOffIsValid(RecPtr))
8206         {
8207                 if (!report)
8208                         return NULL;
8209
8210                 switch (whichChkpt)
8211                 {
8212                         case 1:
8213                                 ereport(LOG,
8214                                                 (errmsg("invalid primary checkpoint link in control file")));
8215                                 break;
8216                         default:
8217                                 ereport(LOG,
8218                                                 (errmsg("invalid checkpoint link in backup_label file")));
8219                                 break;
8220                 }
8221                 return NULL;
8222         }
8223
8224         record = ReadRecord(xlogreader, RecPtr, LOG, true);
8225
8226         if (record == NULL)
8227         {
8228                 if (!report)
8229                         return NULL;
8230
8231                 switch (whichChkpt)
8232                 {
8233                         case 1:
8234                                 ereport(LOG,
8235                                                 (errmsg("invalid primary checkpoint record")));
8236                                 break;
8237                         default:
8238                                 ereport(LOG,
8239                                                 (errmsg("invalid checkpoint record")));
8240                                 break;
8241                 }
8242                 return NULL;
8243         }
8244         if (record->xl_rmid != RM_XLOG_ID)
8245         {
8246                 switch (whichChkpt)
8247                 {
8248                         case 1:
8249                                 ereport(LOG,
8250                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
8251                                 break;
8252                         default:
8253                                 ereport(LOG,
8254                                                 (errmsg("invalid resource manager ID in checkpoint record")));
8255                                 break;
8256                 }
8257                 return NULL;
8258         }
8259         info = record->xl_info & ~XLR_INFO_MASK;
8260         if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8261                 info != XLOG_CHECKPOINT_ONLINE)
8262         {
8263                 switch (whichChkpt)
8264                 {
8265                         case 1:
8266                                 ereport(LOG,
8267                                                 (errmsg("invalid xl_info in primary checkpoint record")));
8268                                 break;
8269                         default:
8270                                 ereport(LOG,
8271                                                 (errmsg("invalid xl_info in checkpoint record")));
8272                                 break;
8273                 }
8274                 return NULL;
8275         }
8276         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8277         {
8278                 switch (whichChkpt)
8279                 {
8280                         case 1:
8281                                 ereport(LOG,
8282                                                 (errmsg("invalid length of primary checkpoint record")));
8283                                 break;
8284                         default:
8285                                 ereport(LOG,
8286                                                 (errmsg("invalid length of checkpoint record")));
8287                                 break;
8288                 }
8289                 return NULL;
8290         }
8291         return record;
8292 }
8293
8294 /*
8295  * This must be called in a backend process before creating WAL records
8296  * (except in a standalone backend, which does StartupXLOG instead).  We need
8297  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8298  *
8299  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8300  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
8301  * unnecessary however, since the postmaster itself never touches XLOG anyway.
8302  */
8303 void
8304 InitXLOGAccess(void)
8305 {
8306         XLogCtlInsert *Insert = &XLogCtl->Insert;
8307
8308         /* ThisTimeLineID doesn't change so we need no lock to copy it */
8309         ThisTimeLineID = XLogCtl->ThisTimeLineID;
8310         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8311
8312         /* set wal_segment_size */
8313         wal_segment_size = ControlFile->xlog_seg_size;
8314
8315         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8316         (void) GetRedoRecPtr();
8317         /* Also update our copy of doPageWrites. */
8318         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8319
8320         /* Also initialize the working areas for constructing WAL records */
8321         InitXLogInsert();
8322 }
8323
8324 /*
8325  * Return the current Redo pointer from shared memory.
8326  *
8327  * As a side-effect, the local RedoRecPtr copy is updated.
8328  */
8329 XLogRecPtr
8330 GetRedoRecPtr(void)
8331 {
8332         XLogRecPtr      ptr;
8333
8334         /*
8335          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8336          * grabbed a WAL insertion lock to read the master copy, someone might
8337          * update it just after we've released the lock.
8338          */
8339         SpinLockAcquire(&XLogCtl->info_lck);
8340         ptr = XLogCtl->RedoRecPtr;
8341         SpinLockRelease(&XLogCtl->info_lck);
8342
8343         if (RedoRecPtr < ptr)
8344                 RedoRecPtr = ptr;
8345
8346         return RedoRecPtr;
8347 }
8348
8349 /*
8350  * Return information needed to decide whether a modified block needs a
8351  * full-page image to be included in the WAL record.
8352  *
8353  * The returned values are cached copies from backend-private memory, and
8354  * possibly out-of-date.  XLogInsertRecord will re-check them against
8355  * up-to-date values, while holding the WAL insert lock.
8356  */
8357 void
8358 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8359 {
8360         *RedoRecPtr_p = RedoRecPtr;
8361         *doPageWrites_p = doPageWrites;
8362 }
8363
8364 /*
8365  * GetInsertRecPtr -- Returns the current insert position.
8366  *
8367  * NOTE: The value *actually* returned is the position of the last full
8368  * xlog page. It lags behind the real insert position by at most 1 page.
8369  * For that, we don't need to scan through WAL insertion locks, and an
8370  * approximation is enough for the current usage of this function.
8371  */
8372 XLogRecPtr
8373 GetInsertRecPtr(void)
8374 {
8375         XLogRecPtr      recptr;
8376
8377         SpinLockAcquire(&XLogCtl->info_lck);
8378         recptr = XLogCtl->LogwrtRqst.Write;
8379         SpinLockRelease(&XLogCtl->info_lck);
8380
8381         return recptr;
8382 }
8383
8384 /*
8385  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8386  * position known to be fsync'd to disk.
8387  */
8388 XLogRecPtr
8389 GetFlushRecPtr(void)
8390 {
8391         SpinLockAcquire(&XLogCtl->info_lck);
8392         LogwrtResult = XLogCtl->LogwrtResult;
8393         SpinLockRelease(&XLogCtl->info_lck);
8394
8395         return LogwrtResult.Flush;
8396 }
8397
8398 /*
8399  * GetLastImportantRecPtr -- Returns the LSN of the last important record
8400  * inserted. All records not explicitly marked as unimportant are considered
8401  * important.
8402  *
8403  * The LSN is determined by computing the maximum of
8404  * WALInsertLocks[i].lastImportantAt.
8405  */
8406 XLogRecPtr
8407 GetLastImportantRecPtr(void)
8408 {
8409         XLogRecPtr      res = InvalidXLogRecPtr;
8410         int                     i;
8411
8412         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8413         {
8414                 XLogRecPtr      last_important;
8415
8416                 /*
8417                  * Need to take a lock to prevent torn reads of the LSN, which are
8418                  * possible on some of the supported platforms. WAL insert locks only
8419                  * support exclusive mode, so we have to use that.
8420                  */
8421                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8422                 last_important = WALInsertLocks[i].l.lastImportantAt;
8423                 LWLockRelease(&WALInsertLocks[i].l.lock);
8424
8425                 if (res < last_important)
8426                         res = last_important;
8427         }
8428
8429         return res;
8430 }
8431
8432 /*
8433  * Get the time and LSN of the last xlog segment switch
8434  */
8435 pg_time_t
8436 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8437 {
8438         pg_time_t       result;
8439
8440         /* Need WALWriteLock, but shared lock is sufficient */
8441         LWLockAcquire(WALWriteLock, LW_SHARED);
8442         result = XLogCtl->lastSegSwitchTime;
8443         *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8444         LWLockRelease(WALWriteLock);
8445
8446         return result;
8447 }
8448
8449 /*
8450  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8451  *
8452  * This is exported for use by code that would like to have 64-bit XIDs.
8453  * We don't really support such things, but all XIDs within the system
8454  * can be presumed "close to" the result, and thus the epoch associated
8455  * with them can be determined.
8456  */
8457 void
8458 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8459 {
8460         uint32          ckptXidEpoch;
8461         TransactionId ckptXid;
8462         TransactionId nextXid;
8463
8464         /* Must read checkpoint info first, else have race condition */
8465         SpinLockAcquire(&XLogCtl->info_lck);
8466         ckptXidEpoch = XLogCtl->ckptXidEpoch;
8467         ckptXid = XLogCtl->ckptXid;
8468         SpinLockRelease(&XLogCtl->info_lck);
8469
8470         /* Now fetch current nextXid */
8471         nextXid = ReadNewTransactionId();
8472
8473         /*
8474          * nextXid is certainly logically later than ckptXid.  So if it's
8475          * numerically less, it must have wrapped into the next epoch.
8476          */
8477         if (nextXid < ckptXid)
8478                 ckptXidEpoch++;
8479
8480         *xid = nextXid;
8481         *epoch = ckptXidEpoch;
8482 }
8483
8484 /*
8485  * This must be called ONCE during postmaster or standalone-backend shutdown
8486  */
8487 void
8488 ShutdownXLOG(int code, Datum arg)
8489 {
8490         /*
8491          * We should have an aux process resource owner to use, and we should not
8492          * be in a transaction that's installed some other resowner.
8493          */
8494         Assert(AuxProcessResourceOwner != NULL);
8495         Assert(CurrentResourceOwner == NULL ||
8496                    CurrentResourceOwner == AuxProcessResourceOwner);
8497         CurrentResourceOwner = AuxProcessResourceOwner;
8498
8499         /* Don't be chatty in standalone mode */
8500         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8501                         (errmsg("shutting down")));
8502
8503         /*
8504          * Signal walsenders to move to stopping state.
8505          */
8506         WalSndInitStopping();
8507
8508         /*
8509          * Wait for WAL senders to be in stopping state.  This prevents commands
8510          * from writing new WAL.
8511          */
8512         WalSndWaitStopping();
8513
8514         if (RecoveryInProgress())
8515                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8516         else
8517         {
8518                 /*
8519                  * If archiving is enabled, rotate the last XLOG file so that all the
8520                  * remaining records are archived (postmaster wakes up the archiver
8521                  * process one more time at the end of shutdown). The checkpoint
8522                  * record will go to the next XLOG file and won't be archived (yet).
8523                  */
8524                 if (XLogArchivingActive() && XLogArchiveCommandSet())
8525                         RequestXLogSwitch(false);
8526
8527                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8528         }
8529         ShutdownCLOG();
8530         ShutdownCommitTs();
8531         ShutdownSUBTRANS();
8532         ShutdownMultiXact();
8533 }
8534
8535 /*
8536  * Log start of a checkpoint.
8537  */
8538 static void
8539 LogCheckpointStart(int flags, bool restartpoint)
8540 {
8541         elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8542                  restartpoint ? "restartpoint" : "checkpoint",
8543                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8544                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8545                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8546                  (flags & CHECKPOINT_FORCE) ? " force" : "",
8547                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
8548                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8549                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8550                  (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8551 }
8552
8553 /*
8554  * Log end of a checkpoint.
8555  */
8556 static void
8557 LogCheckpointEnd(bool restartpoint)
8558 {
8559         long            write_secs,
8560                                 sync_secs,
8561                                 total_secs,
8562                                 longest_secs,
8563                                 average_secs;
8564         int                     write_usecs,
8565                                 sync_usecs,
8566                                 total_usecs,
8567                                 longest_usecs,
8568                                 average_usecs;
8569         uint64          average_sync_time;
8570
8571         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8572
8573         TimestampDifference(CheckpointStats.ckpt_write_t,
8574                                                 CheckpointStats.ckpt_sync_t,
8575                                                 &write_secs, &write_usecs);
8576
8577         TimestampDifference(CheckpointStats.ckpt_sync_t,
8578                                                 CheckpointStats.ckpt_sync_end_t,
8579                                                 &sync_secs, &sync_usecs);
8580
8581         /* Accumulate checkpoint timing summary data, in milliseconds. */
8582         BgWriterStats.m_checkpoint_write_time +=
8583                 write_secs * 1000 + write_usecs / 1000;
8584         BgWriterStats.m_checkpoint_sync_time +=
8585                 sync_secs * 1000 + sync_usecs / 1000;
8586
8587         /*
8588          * All of the published timing statistics are accounted for.  Only
8589          * continue if a log message is to be written.
8590          */
8591         if (!log_checkpoints)
8592                 return;
8593
8594         TimestampDifference(CheckpointStats.ckpt_start_t,
8595                                                 CheckpointStats.ckpt_end_t,
8596                                                 &total_secs, &total_usecs);
8597
8598         /*
8599          * Timing values returned from CheckpointStats are in microseconds.
8600          * Convert to the second plus microsecond form that TimestampDifference
8601          * returns for homogeneous printing.
8602          */
8603         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8604         longest_usecs = CheckpointStats.ckpt_longest_sync -
8605                 (uint64) longest_secs * 1000000;
8606
8607         average_sync_time = 0;
8608         if (CheckpointStats.ckpt_sync_rels > 0)
8609                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8610                         CheckpointStats.ckpt_sync_rels;
8611         average_secs = (long) (average_sync_time / 1000000);
8612         average_usecs = average_sync_time - (uint64) average_secs * 1000000;
8613
8614         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8615                  "%d WAL file(s) added, %d removed, %d recycled; "
8616                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8617                  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8618                  "distance=%d kB, estimate=%d kB",
8619                  restartpoint ? "restartpoint" : "checkpoint",
8620                  CheckpointStats.ckpt_bufs_written,
8621                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8622                  CheckpointStats.ckpt_segs_added,
8623                  CheckpointStats.ckpt_segs_removed,
8624                  CheckpointStats.ckpt_segs_recycled,
8625                  write_secs, write_usecs / 1000,
8626                  sync_secs, sync_usecs / 1000,
8627                  total_secs, total_usecs / 1000,
8628                  CheckpointStats.ckpt_sync_rels,
8629                  longest_secs, longest_usecs / 1000,
8630                  average_secs, average_usecs / 1000,
8631                  (int) (PrevCheckPointDistance / 1024.0),
8632                  (int) (CheckPointDistanceEstimate / 1024.0));
8633 }
8634
8635 /*
8636  * Update the estimate of distance between checkpoints.
8637  *
8638  * The estimate is used to calculate the number of WAL segments to keep
8639  * preallocated, see XLOGFileSlop().
8640  */
8641 static void
8642 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8643 {
8644         /*
8645          * To estimate the number of segments consumed between checkpoints, keep a
8646          * moving average of the amount of WAL generated in previous checkpoint
8647          * cycles. However, if the load is bursty, with quiet periods and busy
8648          * periods, we want to cater for the peak load. So instead of a plain
8649          * moving average, let the average decline slowly if the previous cycle
8650          * used less WAL than estimated, but bump it up immediately if it used
8651          * more.
8652          *
8653          * When checkpoints are triggered by max_wal_size, this should converge to
8654          * CheckpointSegments * wal_segment_size,
8655          *
8656          * Note: This doesn't pay any attention to what caused the checkpoint.
8657          * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8658          * starting a base backup, are counted the same as those created
8659          * automatically. The slow-decline will largely mask them out, if they are
8660          * not frequent. If they are frequent, it seems reasonable to count them
8661          * in as any others; if you issue a manual checkpoint every 5 minutes and
8662          * never let a timed checkpoint happen, it makes sense to base the
8663          * preallocation on that 5 minute interval rather than whatever
8664          * checkpoint_timeout is set to.
8665          */
8666         PrevCheckPointDistance = nbytes;
8667         if (CheckPointDistanceEstimate < nbytes)
8668                 CheckPointDistanceEstimate = nbytes;
8669         else
8670                 CheckPointDistanceEstimate =
8671                         (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8672 }
8673
8674 /*
8675  * Perform a checkpoint --- either during shutdown, or on-the-fly
8676  *
8677  * flags is a bitwise OR of the following:
8678  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8679  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8680  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8681  *              ignoring checkpoint_completion_target parameter.
8682  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8683  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8684  *              CHECKPOINT_END_OF_RECOVERY).
8685  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8686  *
8687  * Note: flags contains other bits, of interest here only for logging purposes.
8688  * In particular note that this routine is synchronous and does not pay
8689  * attention to CHECKPOINT_WAIT.
8690  *
8691  * If !shutdown then we are writing an online checkpoint. This is a very special
8692  * kind of operation and WAL record because the checkpoint action occurs over
8693  * a period of time yet logically occurs at just a single LSN. The logical
8694  * position of the WAL record (redo ptr) is the same or earlier than the
8695  * physical position. When we replay WAL we locate the checkpoint via its
8696  * physical position then read the redo ptr and actually start replay at the
8697  * earlier logical position. Note that we don't write *anything* to WAL at
8698  * the logical position, so that location could be any other kind of WAL record.
8699  * All of this mechanism allows us to continue working while we checkpoint.
8700  * As a result, timing of actions is critical here and be careful to note that
8701  * this function will likely take minutes to execute on a busy system.
8702  */
8703 void
8704 CreateCheckPoint(int flags)
8705 {
8706         bool            shutdown;
8707         CheckPoint      checkPoint;
8708         XLogRecPtr      recptr;
8709         XLogCtlInsert *Insert = &XLogCtl->Insert;
8710         uint32          freespace;
8711         XLogRecPtr      PriorRedoPtr;
8712         XLogRecPtr      curInsert;
8713         XLogRecPtr      last_important_lsn;
8714         VirtualTransactionId *vxids;
8715         int                     nvxids;
8716
8717         /*
8718          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8719          * issued at a different time.
8720          */
8721         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8722                 shutdown = true;
8723         else
8724                 shutdown = false;
8725
8726         /* sanity check */
8727         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8728                 elog(ERROR, "can't create a checkpoint during recovery");
8729
8730         /*
8731          * Initialize InitXLogInsert working areas before entering the critical
8732          * section.  Normally, this is done by the first call to
8733          * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8734          * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8735          * done below in a critical section, and InitXLogInsert cannot be called
8736          * in a critical section.
8737          */
8738         InitXLogInsert();
8739
8740         /*
8741          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8742          * (This is just pro forma, since in the present system structure there is
8743          * only one process that is allowed to issue checkpoints at any given
8744          * time.)
8745          */
8746         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8747
8748         /*
8749          * Prepare to accumulate statistics.
8750          *
8751          * Note: because it is possible for log_checkpoints to change while a
8752          * checkpoint proceeds, we always accumulate stats, even if
8753          * log_checkpoints is currently off.
8754          */
8755         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8756         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8757
8758         /*
8759          * Use a critical section to force system panic if we have trouble.
8760          */
8761         START_CRIT_SECTION();
8762
8763         if (shutdown)
8764         {
8765                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8766                 ControlFile->state = DB_SHUTDOWNING;
8767                 ControlFile->time = (pg_time_t) time(NULL);
8768                 UpdateControlFile();
8769                 LWLockRelease(ControlFileLock);
8770         }
8771
8772         /*
8773          * Let smgr prepare for checkpoint; this has to happen before we determine
8774          * the REDO pointer.  Note that smgr must not do anything that'd have to
8775          * be undone if we decide no checkpoint is needed.
8776          */
8777         smgrpreckpt();
8778
8779         /* Begin filling in the checkpoint WAL record */
8780         MemSet(&checkPoint, 0, sizeof(checkPoint));
8781         checkPoint.time = (pg_time_t) time(NULL);
8782
8783         /*
8784          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8785          * pointer. This allows us to begin accumulating changes to assemble our
8786          * starting snapshot of locks and transactions.
8787          */
8788         if (!shutdown && XLogStandbyInfoActive())
8789                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8790         else
8791                 checkPoint.oldestActiveXid = InvalidTransactionId;
8792
8793         /*
8794          * Get location of last important record before acquiring insert locks (as
8795          * GetLastImportantRecPtr() also locks WAL locks).
8796          */
8797         last_important_lsn = GetLastImportantRecPtr();
8798
8799         /*
8800          * We must block concurrent insertions while examining insert state to
8801          * determine the checkpoint REDO pointer.
8802          */
8803         WALInsertLockAcquireExclusive();
8804         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8805
8806         /*
8807          * If this isn't a shutdown or forced checkpoint, and if there has been no
8808          * WAL activity requiring a checkpoint, skip it.  The idea here is to
8809          * avoid inserting duplicate checkpoints when the system is idle.
8810          */
8811         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8812                                   CHECKPOINT_FORCE)) == 0)
8813         {
8814                 if (last_important_lsn == ControlFile->checkPoint)
8815                 {
8816                         WALInsertLockRelease();
8817                         LWLockRelease(CheckpointLock);
8818                         END_CRIT_SECTION();
8819                         ereport(DEBUG1,
8820                                         (errmsg("checkpoint skipped because system is idle")));
8821                         return;
8822                 }
8823         }
8824
8825         /*
8826          * An end-of-recovery checkpoint is created before anyone is allowed to
8827          * write WAL. To allow us to write the checkpoint record, temporarily
8828          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8829          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8830          */
8831         if (flags & CHECKPOINT_END_OF_RECOVERY)
8832                 LocalSetXLogInsertAllowed();
8833
8834         checkPoint.ThisTimeLineID = ThisTimeLineID;
8835         if (flags & CHECKPOINT_END_OF_RECOVERY)
8836                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8837         else
8838                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8839
8840         checkPoint.fullPageWrites = Insert->fullPageWrites;
8841
8842         /*
8843          * Compute new REDO record ptr = location of next XLOG record.
8844          *
8845          * NB: this is NOT necessarily where the checkpoint record itself will be,
8846          * since other backends may insert more XLOG records while we're off doing
8847          * the buffer flush work.  Those XLOG records are logically after the
8848          * checkpoint, even though physically before it.  Got that?
8849          */
8850         freespace = INSERT_FREESPACE(curInsert);
8851         if (freespace == 0)
8852         {
8853                 if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
8854                         curInsert += SizeOfXLogLongPHD;
8855                 else
8856                         curInsert += SizeOfXLogShortPHD;
8857         }
8858         checkPoint.redo = curInsert;
8859
8860         /*
8861          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8862          * must be done while holding all the insertion locks.
8863          *
8864          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8865          * pointing past where it really needs to point.  This is okay; the only
8866          * consequence is that XLogInsert might back up whole buffers that it
8867          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8868          * XLogInserts that happen while we are dumping buffers must assume that
8869          * their buffer changes are not included in the checkpoint.
8870          */
8871         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8872
8873         /*
8874          * Now we can release the WAL insertion locks, allowing other xacts to
8875          * proceed while we are flushing disk buffers.
8876          */
8877         WALInsertLockRelease();
8878
8879         /* Update the info_lck-protected copy of RedoRecPtr as well */
8880         SpinLockAcquire(&XLogCtl->info_lck);
8881         XLogCtl->RedoRecPtr = checkPoint.redo;
8882         SpinLockRelease(&XLogCtl->info_lck);
8883
8884         /*
8885          * If enabled, log checkpoint start.  We postpone this until now so as not
8886          * to log anything if we decided to skip the checkpoint.
8887          */
8888         if (log_checkpoints)
8889                 LogCheckpointStart(flags, false);
8890
8891         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8892
8893         /*
8894          * Get the other info we need for the checkpoint record.
8895          *
8896          * We don't need to save oldestClogXid in the checkpoint, it only matters
8897          * for the short period in which clog is being truncated, and if we crash
8898          * during that we'll redo the clog truncation and fix up oldestClogXid
8899          * there.
8900          */
8901         LWLockAcquire(XidGenLock, LW_SHARED);
8902         checkPoint.nextXid = ShmemVariableCache->nextXid;
8903         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8904         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8905         LWLockRelease(XidGenLock);
8906
8907         LWLockAcquire(CommitTsLock, LW_SHARED);
8908         checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8909         checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8910         LWLockRelease(CommitTsLock);
8911
8912         /* Increase XID epoch if we've wrapped around since last checkpoint */
8913         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8914         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8915                 checkPoint.nextXidEpoch++;
8916
8917         LWLockAcquire(OidGenLock, LW_SHARED);
8918         checkPoint.nextOid = ShmemVariableCache->nextOid;
8919         if (!shutdown)
8920                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8921         LWLockRelease(OidGenLock);
8922
8923         MultiXactGetCheckptMulti(shutdown,
8924                                                          &checkPoint.nextMulti,
8925                                                          &checkPoint.nextMultiOffset,
8926                                                          &checkPoint.oldestMulti,
8927                                                          &checkPoint.oldestMultiDB);
8928
8929         /*
8930          * Having constructed the checkpoint record, ensure all shmem disk buffers
8931          * and commit-log buffers are flushed to disk.
8932          *
8933          * This I/O could fail for various reasons.  If so, we will fail to
8934          * complete the checkpoint, but there is no reason to force a system
8935          * panic. Accordingly, exit critical section while doing it.
8936          */
8937         END_CRIT_SECTION();
8938
8939         /*
8940          * In some cases there are groups of actions that must all occur on one
8941          * side or the other of a checkpoint record. Before flushing the
8942          * checkpoint record we must explicitly wait for any backend currently
8943          * performing those groups of actions.
8944          *
8945          * One example is end of transaction, so we must wait for any transactions
8946          * that are currently in commit critical sections.  If an xact inserted
8947          * its commit record into XLOG just before the REDO point, then a crash
8948          * restart from the REDO point would not replay that record, which means
8949          * that our flushing had better include the xact's update of pg_xact.  So
8950          * we wait till he's out of his commit critical section before proceeding.
8951          * See notes in RecordTransactionCommit().
8952          *
8953          * Because we've already released the insertion locks, this test is a bit
8954          * fuzzy: it is possible that we will wait for xacts we didn't really need
8955          * to wait for.  But the delay should be short and it seems better to make
8956          * checkpoint take a bit longer than to hold off insertions longer than
8957          * necessary. (In fact, the whole reason we have this issue is that xact.c
8958          * does commit record XLOG insertion and clog update as two separate steps
8959          * protected by different locks, but again that seems best on grounds of
8960          * minimizing lock contention.)
8961          *
8962          * A transaction that has not yet set delayChkpt when we look cannot be at
8963          * risk, since he's not inserted his commit record yet; and one that's
8964          * already cleared it is not at risk either, since he's done fixing clog
8965          * and we will correctly flush the update below.  So we cannot miss any
8966          * xacts we need to wait for.
8967          */
8968         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8969         if (nvxids > 0)
8970         {
8971                 do
8972                 {
8973                         pg_usleep(10000L);      /* wait for 10 msec */
8974                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8975         }
8976         pfree(vxids);
8977
8978         CheckPointGuts(checkPoint.redo, flags);
8979
8980         /*
8981          * Take a snapshot of running transactions and write this to WAL. This
8982          * allows us to reconstruct the state of running transactions during
8983          * archive recovery, if required. Skip, if this info disabled.
8984          *
8985          * If we are shutting down, or Startup process is completing crash
8986          * recovery we don't need to write running xact data.
8987          */
8988         if (!shutdown && XLogStandbyInfoActive())
8989                 LogStandbySnapshot();
8990
8991         START_CRIT_SECTION();
8992
8993         /*
8994          * Now insert the checkpoint record into XLOG.
8995          */
8996         XLogBeginInsert();
8997         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
8998         recptr = XLogInsert(RM_XLOG_ID,
8999                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
9000                                                 XLOG_CHECKPOINT_ONLINE);
9001
9002         XLogFlush(recptr);
9003
9004         /*
9005          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
9006          * overwritten at next startup.  No-one should even try, this just allows
9007          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
9008          * to just temporarily disable writing until the system has exited
9009          * recovery.
9010          */
9011         if (shutdown)
9012         {
9013                 if (flags & CHECKPOINT_END_OF_RECOVERY)
9014                         LocalXLogInsertAllowed = -1;    /* return to "check" state */
9015                 else
9016                         LocalXLogInsertAllowed = 0; /* never again write WAL */
9017         }
9018
9019         /*
9020          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
9021          * = end of actual checkpoint record.
9022          */
9023         if (shutdown && checkPoint.redo != ProcLastRecPtr)
9024                 ereport(PANIC,
9025                                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
9026
9027         /*
9028          * Remember the prior checkpoint's redo ptr for
9029          * UpdateCheckPointDistanceEstimate()
9030          */
9031         PriorRedoPtr = ControlFile->checkPointCopy.redo;
9032
9033         /*
9034          * Update the control file.
9035          */
9036         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9037         if (shutdown)
9038                 ControlFile->state = DB_SHUTDOWNED;
9039         ControlFile->checkPoint = ProcLastRecPtr;
9040         ControlFile->checkPointCopy = checkPoint;
9041         ControlFile->time = (pg_time_t) time(NULL);
9042         /* crash recovery should always recover to the end of WAL */
9043         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
9044         ControlFile->minRecoveryPointTLI = 0;
9045
9046         /*
9047          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
9048          * unused on non-shutdown checkpoints, but seems useful to store it always
9049          * for debugging purposes.
9050          */
9051         SpinLockAcquire(&XLogCtl->ulsn_lck);
9052         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
9053         SpinLockRelease(&XLogCtl->ulsn_lck);
9054
9055         UpdateControlFile();
9056         LWLockRelease(ControlFileLock);
9057
9058         /* Update shared-memory copy of checkpoint XID/epoch */
9059         SpinLockAcquire(&XLogCtl->info_lck);
9060         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9061         XLogCtl->ckptXid = checkPoint.nextXid;
9062         SpinLockRelease(&XLogCtl->info_lck);
9063
9064         /*
9065          * We are now done with critical updates; no need for system panic if we
9066          * have trouble while fooling with old log segments.
9067          */
9068         END_CRIT_SECTION();
9069
9070         /*
9071          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
9072          */
9073         smgrpostckpt();
9074
9075         /*
9076          * Delete old log files and recycle them
9077          */
9078         if (PriorRedoPtr != InvalidXLogRecPtr)
9079         {
9080                 XLogSegNo       _logSegNo;
9081
9082                 /* Update the average distance between checkpoints. */
9083                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9084
9085                 /* Trim from the last checkpoint, not the last - 1 */
9086                 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9087                 KeepLogSeg(recptr, &_logSegNo);
9088                 _logSegNo--;
9089                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
9090         }
9091
9092         /*
9093          * Make more log segments if needed.  (Do this after recycling old log
9094          * segments, since that may supply some of the needed files.)
9095          */
9096         if (!shutdown)
9097                 PreallocXlogFiles(recptr);
9098
9099         /*
9100          * Truncate pg_subtrans if possible.  We can throw away all data before
9101          * the oldest XMIN of any running transaction.  No future transaction will
9102          * attempt to reference any pg_subtrans entry older than that (see Asserts
9103          * in subtrans.c).  During recovery, though, we mustn't do this because
9104          * StartupSUBTRANS hasn't been called yet.
9105          */
9106         if (!RecoveryInProgress())
9107                 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9108
9109         /* Real work is done, but log and update stats before releasing lock. */
9110         LogCheckpointEnd(false);
9111
9112         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
9113                                                                          NBuffers,
9114                                                                          CheckpointStats.ckpt_segs_added,
9115                                                                          CheckpointStats.ckpt_segs_removed,
9116                                                                          CheckpointStats.ckpt_segs_recycled);
9117
9118         LWLockRelease(CheckpointLock);
9119 }
9120
9121 /*
9122  * Mark the end of recovery in WAL though without running a full checkpoint.
9123  * We can expect that a restartpoint is likely to be in progress as we
9124  * do this, though we are unwilling to wait for it to complete. So be
9125  * careful to avoid taking the CheckpointLock anywhere here.
9126  *
9127  * CreateRestartPoint() allows for the case where recovery may end before
9128  * the restartpoint completes so there is no concern of concurrent behaviour.
9129  */
9130 static void
9131 CreateEndOfRecoveryRecord(void)
9132 {
9133         xl_end_of_recovery xlrec;
9134         XLogRecPtr      recptr;
9135
9136         /* sanity check */
9137         if (!RecoveryInProgress())
9138                 elog(ERROR, "can only be used to end recovery");
9139
9140         xlrec.end_time = GetCurrentTimestamp();
9141
9142         WALInsertLockAcquireExclusive();
9143         xlrec.ThisTimeLineID = ThisTimeLineID;
9144         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
9145         WALInsertLockRelease();
9146
9147         LocalSetXLogInsertAllowed();
9148
9149         START_CRIT_SECTION();
9150
9151         XLogBeginInsert();
9152         XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
9153         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
9154
9155         XLogFlush(recptr);
9156
9157         /*
9158          * Update the control file so that crash recovery can follow the timeline
9159          * changes to this point.
9160          */
9161         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9162         ControlFile->time = (pg_time_t) time(NULL);
9163         ControlFile->minRecoveryPoint = recptr;
9164         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9165         UpdateControlFile();
9166         LWLockRelease(ControlFileLock);
9167
9168         END_CRIT_SECTION();
9169
9170         LocalXLogInsertAllowed = -1;    /* return to "check" state */
9171 }
9172
9173 /*
9174  * Flush all data in shared memory to disk, and fsync
9175  *
9176  * This is the common code shared between regular checkpoints and
9177  * recovery restartpoints.
9178  */
9179 static void
9180 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
9181 {
9182         CheckPointCLOG();
9183         CheckPointCommitTs();
9184         CheckPointSUBTRANS();
9185         CheckPointMultiXact();
9186         CheckPointPredicate();
9187         CheckPointRelationMap();
9188         CheckPointReplicationSlots();
9189         CheckPointSnapBuild();
9190         CheckPointLogicalRewriteHeap();
9191         CheckPointBuffers(flags);       /* performs all required fsyncs */
9192         CheckPointReplicationOrigin();
9193         /* We deliberately delay 2PC checkpointing as long as possible */
9194         CheckPointTwoPhase(checkPointRedo);
9195 }
9196
9197 /*
9198  * Save a checkpoint for recovery restart if appropriate
9199  *
9200  * This function is called each time a checkpoint record is read from XLOG.
9201  * It must determine whether the checkpoint represents a safe restartpoint or
9202  * not.  If so, the checkpoint record is stashed in shared memory so that
9203  * CreateRestartPoint can consult it.  (Note that the latter function is
9204  * executed by the checkpointer, while this one will be executed by the
9205  * startup process.)
9206  */
9207 static void
9208 RecoveryRestartPoint(const CheckPoint *checkPoint)
9209 {
9210         /*
9211          * Also refrain from creating a restartpoint if we have seen any
9212          * references to non-existent pages. Restarting recovery from the
9213          * restartpoint would not see the references, so we would lose the
9214          * cross-check that the pages belonged to a relation that was dropped
9215          * later.
9216          */
9217         if (XLogHaveInvalidPages())
9218         {
9219                 elog(trace_recovery(DEBUG2),
9220                          "could not record restart point at %X/%X because there "
9221                          "are unresolved references to invalid pages",
9222                          (uint32) (checkPoint->redo >> 32),
9223                          (uint32) checkPoint->redo);
9224                 return;
9225         }
9226
9227         /*
9228          * Copy the checkpoint record to shared memory, so that checkpointer can
9229          * work out the next time it wants to perform a restartpoint.
9230          */
9231         SpinLockAcquire(&XLogCtl->info_lck);
9232         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
9233         XLogCtl->lastCheckPointEndPtr = EndRecPtr;
9234         XLogCtl->lastCheckPoint = *checkPoint;
9235         SpinLockRelease(&XLogCtl->info_lck);
9236 }
9237
9238 /*
9239  * Establish a restartpoint if possible.
9240  *
9241  * This is similar to CreateCheckPoint, but is used during WAL recovery
9242  * to establish a point from which recovery can roll forward without
9243  * replaying the entire recovery log.
9244  *
9245  * Returns true if a new restartpoint was established. We can only establish
9246  * a restartpoint if we have replayed a safe checkpoint record since last
9247  * restartpoint.
9248  */
9249 bool
9250 CreateRestartPoint(int flags)
9251 {
9252         XLogRecPtr      lastCheckPointRecPtr;
9253         XLogRecPtr      lastCheckPointEndPtr;
9254         CheckPoint      lastCheckPoint;
9255         XLogRecPtr      PriorRedoPtr;
9256         TimestampTz xtime;
9257
9258         /*
9259          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9260          * happens at a time.
9261          */
9262         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9263
9264         /* Get a local copy of the last safe checkpoint record. */
9265         SpinLockAcquire(&XLogCtl->info_lck);
9266         lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9267         lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9268         lastCheckPoint = XLogCtl->lastCheckPoint;
9269         SpinLockRelease(&XLogCtl->info_lck);
9270
9271         /*
9272          * Check that we're still in recovery mode. It's ok if we exit recovery
9273          * mode after this check, the restart point is valid anyway.
9274          */
9275         if (!RecoveryInProgress())
9276         {
9277                 ereport(DEBUG2,
9278                                 (errmsg("skipping restartpoint, recovery has already ended")));
9279                 LWLockRelease(CheckpointLock);
9280                 return false;
9281         }
9282
9283         /*
9284          * If the last checkpoint record we've replayed is already our last
9285          * restartpoint, we can't perform a new restart point. We still update
9286          * minRecoveryPoint in that case, so that if this is a shutdown restart
9287          * point, we won't start up earlier than before. That's not strictly
9288          * necessary, but when hot standby is enabled, it would be rather weird if
9289          * the database opened up for read-only connections at a point-in-time
9290          * before the last shutdown. Such time travel is still possible in case of
9291          * immediate shutdown, though.
9292          *
9293          * We don't explicitly advance minRecoveryPoint when we do create a
9294          * restartpoint. It's assumed that flushing the buffers will do that as a
9295          * side-effect.
9296          */
9297         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9298                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9299         {
9300                 ereport(DEBUG2,
9301                                 (errmsg("skipping restartpoint, already performed at %X/%X",
9302                                                 (uint32) (lastCheckPoint.redo >> 32),
9303                                                 (uint32) lastCheckPoint.redo)));
9304
9305                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9306                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9307                 {
9308                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9309                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9310                         ControlFile->time = (pg_time_t) time(NULL);
9311                         UpdateControlFile();
9312                         LWLockRelease(ControlFileLock);
9313                 }
9314                 LWLockRelease(CheckpointLock);
9315                 return false;
9316         }
9317
9318         /*
9319          * Update the shared RedoRecPtr so that the startup process can calculate
9320          * the number of segments replayed since last restartpoint, and request a
9321          * restartpoint if it exceeds CheckPointSegments.
9322          *
9323          * Like in CreateCheckPoint(), hold off insertions to update it, although
9324          * during recovery this is just pro forma, because no WAL insertions are
9325          * happening.
9326          */
9327         WALInsertLockAcquireExclusive();
9328         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9329         WALInsertLockRelease();
9330
9331         /* Also update the info_lck-protected copy */
9332         SpinLockAcquire(&XLogCtl->info_lck);
9333         XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9334         SpinLockRelease(&XLogCtl->info_lck);
9335
9336         /*
9337          * Prepare to accumulate statistics.
9338          *
9339          * Note: because it is possible for log_checkpoints to change while a
9340          * checkpoint proceeds, we always accumulate stats, even if
9341          * log_checkpoints is currently off.
9342          */
9343         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9344         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9345
9346         if (log_checkpoints)
9347                 LogCheckpointStart(flags, true);
9348
9349         CheckPointGuts(lastCheckPoint.redo, flags);
9350
9351         /*
9352          * Remember the prior checkpoint's redo ptr for
9353          * UpdateCheckPointDistanceEstimate()
9354          */
9355         PriorRedoPtr = ControlFile->checkPointCopy.redo;
9356
9357         /*
9358          * Update pg_control, using current time.  Check that it still shows
9359          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9360          * this is a quick hack to make sure nothing really bad happens if somehow
9361          * we get here after the end-of-recovery checkpoint.
9362          */
9363         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9364         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9365                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9366         {
9367                 ControlFile->checkPoint = lastCheckPointRecPtr;
9368                 ControlFile->checkPointCopy = lastCheckPoint;
9369                 ControlFile->time = (pg_time_t) time(NULL);
9370
9371                 /*
9372                  * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
9373                  * this will have happened already while writing out dirty buffers,
9374                  * but not necessarily - e.g. because no buffers were dirtied.  We do
9375                  * this because a non-exclusive base backup uses minRecoveryPoint to
9376                  * determine which WAL files must be included in the backup, and the
9377                  * file (or files) containing the checkpoint record must be included,
9378                  * at a minimum. Note that for an ordinary restart of recovery there's
9379                  * no value in having the minimum recovery point any earlier than this
9380                  * anyway, because redo will begin just after the checkpoint record.
9381                  */
9382                 if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9383                 {
9384                         ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9385                         ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9386
9387                         /* update local copy */
9388                         minRecoveryPoint = ControlFile->minRecoveryPoint;
9389                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9390                 }
9391                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9392                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9393                 UpdateControlFile();
9394         }
9395         LWLockRelease(ControlFileLock);
9396
9397         /*
9398          * Delete old log files (those no longer needed even for previous
9399          * checkpoint/restartpoint) to prevent the disk holding the xlog from
9400          * growing full.
9401          */
9402         if (PriorRedoPtr != InvalidXLogRecPtr)
9403         {
9404                 XLogRecPtr      receivePtr;
9405                 XLogRecPtr      replayPtr;
9406                 TimeLineID      replayTLI;
9407                 XLogRecPtr      endptr;
9408                 XLogSegNo       _logSegNo;
9409
9410                 /* Update the average distance between checkpoints/restartpoints. */
9411                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9412
9413                 XLByteToSeg(PriorRedoPtr, _logSegNo, wal_segment_size);
9414
9415                 /*
9416                  * Get the current end of xlog replayed or received, whichever is
9417                  * later.
9418                  */
9419                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9420                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
9421                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9422
9423                 KeepLogSeg(endptr, &_logSegNo);
9424                 _logSegNo--;
9425
9426                 /*
9427                  * Try to recycle segments on a useful timeline. If we've been
9428                  * promoted since the beginning of this restartpoint, use the new
9429                  * timeline chosen at end of recovery (RecoveryInProgress() sets
9430                  * ThisTimeLineID in that case). If we're still in recovery, use the
9431                  * timeline we're currently replaying.
9432                  *
9433                  * There is no guarantee that the WAL segments will be useful on the
9434                  * current timeline; if recovery proceeds to a new timeline right
9435                  * after this, the pre-allocated WAL segments on this timeline will
9436                  * not be used, and will go wasted until recycled on the next
9437                  * restartpoint. We'll live with that.
9438                  */
9439                 if (RecoveryInProgress())
9440                         ThisTimeLineID = replayTLI;
9441
9442                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
9443
9444                 /*
9445                  * Make more log segments if needed.  (Do this after recycling old log
9446                  * segments, since that may supply some of the needed files.)
9447                  */
9448                 PreallocXlogFiles(endptr);
9449
9450                 /*
9451                  * ThisTimeLineID is normally not set when we're still in recovery.
9452                  * However, recycling/preallocating segments above needed
9453                  * ThisTimeLineID to determine which timeline to install the segments
9454                  * on. Reset it now, to restore the normal state of affairs for
9455                  * debugging purposes.
9456                  */
9457                 if (RecoveryInProgress())
9458                         ThisTimeLineID = 0;
9459         }
9460
9461         /*
9462          * Truncate pg_subtrans if possible.  We can throw away all data before
9463          * the oldest XMIN of any running transaction.  No future transaction will
9464          * attempt to reference any pg_subtrans entry older than that (see Asserts
9465          * in subtrans.c).  When hot standby is disabled, though, we mustn't do
9466          * this because StartupSUBTRANS hasn't been called yet.
9467          */
9468         if (EnableHotStandby)
9469                 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9470
9471         /* Real work is done, but log and update before releasing lock. */
9472         LogCheckpointEnd(true);
9473
9474         xtime = GetLatestXTime();
9475         ereport((log_checkpoints ? LOG : DEBUG2),
9476                         (errmsg("recovery restart point at %X/%X",
9477                                         (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9478                          xtime ? errdetail("Last completed transaction was at log time %s.",
9479                                                            timestamptz_to_str(xtime)) : 0));
9480
9481         LWLockRelease(CheckpointLock);
9482
9483         /*
9484          * Finally, execute archive_cleanup_command, if any.
9485          */
9486         if (XLogCtl->archiveCleanupCommand[0])
9487                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9488                                                            "archive_cleanup_command",
9489                                                            false);
9490
9491         return true;
9492 }
9493
9494 /*
9495  * Retreat *logSegNo to the last segment that we need to retain because of
9496  * either wal_keep_segments or replication slots.
9497  *
9498  * This is calculated by subtracting wal_keep_segments from the given xlog
9499  * location, recptr and by making sure that that result is below the
9500  * requirement of replication slots.
9501  */
9502 static void
9503 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9504 {
9505         XLogSegNo       segno;
9506         XLogRecPtr      keep;
9507
9508         XLByteToSeg(recptr, segno, wal_segment_size);
9509         keep = XLogGetReplicationSlotMinimumLSN();
9510
9511         /* compute limit for wal_keep_segments first */
9512         if (wal_keep_segments > 0)
9513         {
9514                 /* avoid underflow, don't go below 1 */
9515                 if (segno <= wal_keep_segments)
9516                         segno = 1;
9517                 else
9518                         segno = segno - wal_keep_segments;
9519         }
9520
9521         /* then check whether slots limit removal further */
9522         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9523         {
9524                 XLogSegNo       slotSegNo;
9525
9526                 XLByteToSeg(keep, slotSegNo, wal_segment_size);
9527
9528                 if (slotSegNo <= 0)
9529                         segno = 1;
9530                 else if (slotSegNo < segno)
9531                         segno = slotSegNo;
9532         }
9533
9534         /* don't delete WAL segments newer than the calculated segment */
9535         if (segno < *logSegNo)
9536                 *logSegNo = segno;
9537 }
9538
9539 /*
9540  * Write a NEXTOID log record
9541  */
9542 void
9543 XLogPutNextOid(Oid nextOid)
9544 {
9545         XLogBeginInsert();
9546         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9547         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9548
9549         /*
9550          * We need not flush the NEXTOID record immediately, because any of the
9551          * just-allocated OIDs could only reach disk as part of a tuple insert or
9552          * update that would have its own XLOG record that must follow the NEXTOID
9553          * record.  Therefore, the standard buffer LSN interlock applied to those
9554          * records will ensure no such OID reaches disk before the NEXTOID record
9555          * does.
9556          *
9557          * Note, however, that the above statement only covers state "within" the
9558          * database.  When we use a generated OID as a file or directory name, we
9559          * are in a sense violating the basic WAL rule, because that filesystem
9560          * change may reach disk before the NEXTOID WAL record does.  The impact
9561          * of this is that if a database crash occurs immediately afterward, we
9562          * might after restart re-generate the same OID and find that it conflicts
9563          * with the leftover file or directory.  But since for safety's sake we
9564          * always loop until finding a nonconflicting filename, this poses no real
9565          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9566          */
9567 }
9568
9569 /*
9570  * Write an XLOG SWITCH record.
9571  *
9572  * Here we just blindly issue an XLogInsert request for the record.
9573  * All the magic happens inside XLogInsert.
9574  *
9575  * The return value is either the end+1 address of the switch record,
9576  * or the end+1 address of the prior segment if we did not need to
9577  * write a switch record because we are already at segment start.
9578  */
9579 XLogRecPtr
9580 RequestXLogSwitch(bool mark_unimportant)
9581 {
9582         XLogRecPtr      RecPtr;
9583
9584         /* XLOG SWITCH has no data */
9585         XLogBeginInsert();
9586
9587         if (mark_unimportant)
9588                 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9589         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9590
9591         return RecPtr;
9592 }
9593
9594 /*
9595  * Write a RESTORE POINT record
9596  */
9597 XLogRecPtr
9598 XLogRestorePoint(const char *rpName)
9599 {
9600         XLogRecPtr      RecPtr;
9601         xl_restore_point xlrec;
9602
9603         xlrec.rp_time = GetCurrentTimestamp();
9604         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9605
9606         XLogBeginInsert();
9607         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9608
9609         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9610
9611         ereport(LOG,
9612                         (errmsg("restore point \"%s\" created at %X/%X",
9613                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9614
9615         return RecPtr;
9616 }
9617
9618 /*
9619  * Check if any of the GUC parameters that are critical for hot standby
9620  * have changed, and update the value in pg_control file if necessary.
9621  */
9622 static void
9623 XLogReportParameters(void)
9624 {
9625         if (wal_level != ControlFile->wal_level ||
9626                 wal_log_hints != ControlFile->wal_log_hints ||
9627                 MaxConnections != ControlFile->MaxConnections ||
9628                 max_worker_processes != ControlFile->max_worker_processes ||
9629                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9630                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
9631                 track_commit_timestamp != ControlFile->track_commit_timestamp)
9632         {
9633                 /*
9634                  * The change in number of backend slots doesn't need to be WAL-logged
9635                  * if archiving is not enabled, as you can't start archive recovery
9636                  * with wal_level=minimal anyway. We don't really care about the
9637                  * values in pg_control either if wal_level=minimal, but seems better
9638                  * to keep them up-to-date to avoid confusion.
9639                  */
9640                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9641                 {
9642                         xl_parameter_change xlrec;
9643                         XLogRecPtr      recptr;
9644
9645                         xlrec.MaxConnections = MaxConnections;
9646                         xlrec.max_worker_processes = max_worker_processes;
9647                         xlrec.max_prepared_xacts = max_prepared_xacts;
9648                         xlrec.max_locks_per_xact = max_locks_per_xact;
9649                         xlrec.wal_level = wal_level;
9650                         xlrec.wal_log_hints = wal_log_hints;
9651                         xlrec.track_commit_timestamp = track_commit_timestamp;
9652
9653                         XLogBeginInsert();
9654                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9655
9656                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9657                         XLogFlush(recptr);
9658                 }
9659
9660                 ControlFile->MaxConnections = MaxConnections;
9661                 ControlFile->max_worker_processes = max_worker_processes;
9662                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9663                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9664                 ControlFile->wal_level = wal_level;
9665                 ControlFile->wal_log_hints = wal_log_hints;
9666                 ControlFile->track_commit_timestamp = track_commit_timestamp;
9667                 UpdateControlFile();
9668         }
9669 }
9670
9671 /*
9672  * Update full_page_writes in shared memory, and write an
9673  * XLOG_FPW_CHANGE record if necessary.
9674  *
9675  * Note: this function assumes there is no other process running
9676  * concurrently that could update it.
9677  */
9678 void
9679 UpdateFullPageWrites(void)
9680 {
9681         XLogCtlInsert *Insert = &XLogCtl->Insert;
9682
9683         /*
9684          * Do nothing if full_page_writes has not been changed.
9685          *
9686          * It's safe to check the shared full_page_writes without the lock,
9687          * because we assume that there is no concurrently running process which
9688          * can update it.
9689          */
9690         if (fullPageWrites == Insert->fullPageWrites)
9691                 return;
9692
9693         START_CRIT_SECTION();
9694
9695         /*
9696          * It's always safe to take full page images, even when not strictly
9697          * required, but not the other round. So if we're setting full_page_writes
9698          * to true, first set it true and then write the WAL record. If we're
9699          * setting it to false, first write the WAL record and then set the global
9700          * flag.
9701          */
9702         if (fullPageWrites)
9703         {
9704                 WALInsertLockAcquireExclusive();
9705                 Insert->fullPageWrites = true;
9706                 WALInsertLockRelease();
9707         }
9708
9709         /*
9710          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9711          * full_page_writes during archive recovery, if required.
9712          */
9713         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9714         {
9715                 XLogBeginInsert();
9716                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
9717
9718                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
9719         }
9720
9721         if (!fullPageWrites)
9722         {
9723                 WALInsertLockAcquireExclusive();
9724                 Insert->fullPageWrites = false;
9725                 WALInsertLockRelease();
9726         }
9727         END_CRIT_SECTION();
9728 }
9729
9730 /*
9731  * Check that it's OK to switch to new timeline during recovery.
9732  *
9733  * 'lsn' is the address of the shutdown checkpoint record we're about to
9734  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9735  */
9736 static void
9737 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9738 {
9739         /* Check that the record agrees on what the current (old) timeline is */
9740         if (prevTLI != ThisTimeLineID)
9741                 ereport(PANIC,
9742                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9743                                                 prevTLI, ThisTimeLineID)));
9744
9745         /*
9746          * The new timeline better be in the list of timelines we expect to see,
9747          * according to the timeline history. It should also not decrease.
9748          */
9749         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9750                 ereport(PANIC,
9751                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9752                                                 newTLI, ThisTimeLineID)));
9753
9754         /*
9755          * If we have not yet reached min recovery point, and we're about to
9756          * switch to a timeline greater than the timeline of the min recovery
9757          * point: trouble. After switching to the new timeline, we could not
9758          * possibly visit the min recovery point on the correct timeline anymore.
9759          * This can happen if there is a newer timeline in the archive that
9760          * branched before the timeline the min recovery point is on, and you
9761          * attempt to do PITR to the new timeline.
9762          */
9763         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9764                 lsn < minRecoveryPoint &&
9765                 newTLI > minRecoveryPointTLI)
9766                 ereport(PANIC,
9767                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9768                                                 newTLI,
9769                                                 (uint32) (minRecoveryPoint >> 32),
9770                                                 (uint32) minRecoveryPoint,
9771                                                 minRecoveryPointTLI)));
9772
9773         /* Looks good */
9774 }
9775
9776 /*
9777  * XLOG resource manager's routines
9778  *
9779  * Definitions of info values are in include/catalog/pg_control.h, though
9780  * not all record types are related to control file updates.
9781  */
9782 void
9783 xlog_redo(XLogReaderState *record)
9784 {
9785         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9786         XLogRecPtr      lsn = record->EndRecPtr;
9787
9788         /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
9789         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
9790                    !XLogRecHasAnyBlockRefs(record));
9791
9792         if (info == XLOG_NEXTOID)
9793         {
9794                 Oid                     nextOid;
9795
9796                 /*
9797                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9798                  * and the recorded nextOid, but that fails if the OID counter wraps
9799                  * around.  Since no OID allocation should be happening during replay
9800                  * anyway, better to just believe the record exactly.  We still take
9801                  * OidGenLock while setting the variable, just in case.
9802                  */
9803                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9804                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9805                 ShmemVariableCache->nextOid = nextOid;
9806                 ShmemVariableCache->oidCount = 0;
9807                 LWLockRelease(OidGenLock);
9808         }
9809         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9810         {
9811                 CheckPoint      checkPoint;
9812
9813                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9814                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9815                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9816                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9817                 LWLockRelease(XidGenLock);
9818                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9819                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9820                 ShmemVariableCache->oidCount = 0;
9821                 LWLockRelease(OidGenLock);
9822                 MultiXactSetNextMXact(checkPoint.nextMulti,
9823                                                           checkPoint.nextMultiOffset);
9824
9825                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9826                                                            checkPoint.oldestMultiDB);
9827
9828                 /*
9829                  * No need to set oldestClogXid here as well; it'll be set when we
9830                  * redo an xl_clog_truncate if it changed since initialization.
9831                  */
9832                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9833
9834                 /*
9835                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9836                  * record, the backup was canceled and the end-of-backup record will
9837                  * never arrive.
9838                  */
9839                 if (ArchiveRecoveryRequested &&
9840                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9841                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9842                         ereport(PANIC,
9843                                         (errmsg("online backup was canceled, recovery cannot continue")));
9844
9845                 /*
9846                  * If we see a shutdown checkpoint, we know that nothing was running
9847                  * on the master at this point. So fake-up an empty running-xacts
9848                  * record and use that here and now. Recover additional standby state
9849                  * for prepared transactions.
9850                  */
9851                 if (standbyState >= STANDBY_INITIALIZED)
9852                 {
9853                         TransactionId *xids;
9854                         int                     nxids;
9855                         TransactionId oldestActiveXID;
9856                         TransactionId latestCompletedXid;
9857                         RunningTransactionsData running;
9858
9859                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9860
9861                         /*
9862                          * Construct a RunningTransactions snapshot representing a shut
9863                          * down server, with only prepared transactions still alive. We're
9864                          * never overflowed at this point because all subxids are listed
9865                          * with their parent prepared transactions.
9866                          */
9867                         running.xcnt = nxids;
9868                         running.subxcnt = 0;
9869                         running.subxid_overflow = false;
9870                         running.nextXid = checkPoint.nextXid;
9871                         running.oldestRunningXid = oldestActiveXID;
9872                         latestCompletedXid = checkPoint.nextXid;
9873                         TransactionIdRetreat(latestCompletedXid);
9874                         Assert(TransactionIdIsNormal(latestCompletedXid));
9875                         running.latestCompletedXid = latestCompletedXid;
9876                         running.xids = xids;
9877
9878                         ProcArrayApplyRecoveryInfo(&running);
9879
9880                         StandbyRecoverPreparedTransactions();
9881                 }
9882
9883                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9884                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9885                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9886
9887                 /* Update shared-memory copy of checkpoint XID/epoch */
9888                 SpinLockAcquire(&XLogCtl->info_lck);
9889                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9890                 XLogCtl->ckptXid = checkPoint.nextXid;
9891                 SpinLockRelease(&XLogCtl->info_lck);
9892
9893                 /*
9894                  * We should've already switched to the new TLI before replaying this
9895                  * record.
9896                  */
9897                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9898                         ereport(PANIC,
9899                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9900                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9901
9902                 RecoveryRestartPoint(&checkPoint);
9903         }
9904         else if (info == XLOG_CHECKPOINT_ONLINE)
9905         {
9906                 CheckPoint      checkPoint;
9907
9908                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9909                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9910                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9911                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9912                                                                   checkPoint.nextXid))
9913                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9914                 LWLockRelease(XidGenLock);
9915
9916                 /*
9917                  * We ignore the nextOid counter in an ONLINE checkpoint, preferring
9918                  * to track OID assignment through XLOG_NEXTOID records.  The nextOid
9919                  * counter is from the start of the checkpoint and might well be stale
9920                  * compared to later XLOG_NEXTOID records.  We could try to take the
9921                  * maximum of the nextOid counter and our latest value, but since
9922                  * there's no particular guarantee about the speed with which the OID
9923                  * counter wraps around, that's a risky thing to do.  In any case,
9924                  * users of the nextOid counter are required to avoid assignment of
9925                  * duplicates, so that a somewhat out-of-date value should be safe.
9926                  */
9927
9928                 /* Handle multixact */
9929                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9930                                                                   checkPoint.nextMultiOffset);
9931
9932                 /*
9933                  * NB: This may perform multixact truncation when replaying WAL
9934                  * generated by an older primary.
9935                  */
9936                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9937                                                            checkPoint.oldestMultiDB);
9938                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9939                                                                   checkPoint.oldestXid))
9940                         SetTransactionIdLimit(checkPoint.oldestXid,
9941                                                                   checkPoint.oldestXidDB);
9942                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9943                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9944                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9945
9946                 /* Update shared-memory copy of checkpoint XID/epoch */
9947                 SpinLockAcquire(&XLogCtl->info_lck);
9948                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9949                 XLogCtl->ckptXid = checkPoint.nextXid;
9950                 SpinLockRelease(&XLogCtl->info_lck);
9951
9952                 /* TLI should not change in an on-line checkpoint */
9953                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9954                         ereport(PANIC,
9955                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9956                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9957
9958                 RecoveryRestartPoint(&checkPoint);
9959         }
9960         else if (info == XLOG_END_OF_RECOVERY)
9961         {
9962                 xl_end_of_recovery xlrec;
9963
9964                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9965
9966                 /*
9967                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9968                  * but this case is rarer and harder to test, so the benefit doesn't
9969                  * outweigh the potential extra cost of maintenance.
9970                  */
9971
9972                 /*
9973                  * We should've already switched to the new TLI before replaying this
9974                  * record.
9975                  */
9976                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9977                         ereport(PANIC,
9978                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9979                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9980         }
9981         else if (info == XLOG_NOOP)
9982         {
9983                 /* nothing to do here */
9984         }
9985         else if (info == XLOG_SWITCH)
9986         {
9987                 /* nothing to do here */
9988         }
9989         else if (info == XLOG_RESTORE_POINT)
9990         {
9991                 /* nothing to do here */
9992         }
9993         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
9994         {
9995                 Buffer          buffer;
9996
9997                 /*
9998                  * Full-page image (FPI) records contain nothing else but a backup
9999                  * block. The block reference must include a full-page image -
10000                  * otherwise there would be no point in this record.
10001                  *
10002                  * No recovery conflicts are generated by these generic records - if a
10003                  * resource manager needs to generate conflicts, it has to define a
10004                  * separate WAL record type and redo routine.
10005                  *
10006                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10007                  * WAL- logged because of a hint bit update. They are only generated
10008                  * when checksums are enabled. There is no difference in handling
10009                  * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10010                  * code just to distinguish them for statistics purposes.
10011                  */
10012                 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
10013                         elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10014                 UnlockReleaseBuffer(buffer);
10015         }
10016         else if (info == XLOG_BACKUP_END)
10017         {
10018                 XLogRecPtr      startpoint;
10019
10020                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10021
10022                 if (ControlFile->backupStartPoint == startpoint)
10023                 {
10024                         /*
10025                          * We have reached the end of base backup, the point where
10026                          * pg_stop_backup() was done. The data on disk is now consistent.
10027                          * Reset backupStartPoint, and update minRecoveryPoint to make
10028                          * sure we don't allow starting up at an earlier point even if
10029                          * recovery is stopped and restarted soon after this.
10030                          */
10031                         elog(DEBUG1, "end of backup reached");
10032
10033                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10034
10035                         if (ControlFile->minRecoveryPoint < lsn)
10036                         {
10037                                 ControlFile->minRecoveryPoint = lsn;
10038                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10039                         }
10040                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
10041                         ControlFile->backupEndRequired = false;
10042                         UpdateControlFile();
10043
10044                         LWLockRelease(ControlFileLock);
10045                 }
10046         }
10047         else if (info == XLOG_PARAMETER_CHANGE)
10048         {
10049                 xl_parameter_change xlrec;
10050
10051                 /* Update our copy of the parameters in pg_control */
10052                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10053
10054                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10055                 ControlFile->MaxConnections = xlrec.MaxConnections;
10056                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
10057                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10058                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10059                 ControlFile->wal_level = xlrec.wal_level;
10060                 ControlFile->wal_log_hints = xlrec.wal_log_hints;
10061
10062                 /*
10063                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
10064                  * recover back up to this point before allowing hot standby again.
10065                  * This is important if the max_* settings are decreased, to ensure
10066                  * you don't run queries against the WAL preceding the change. The
10067                  * local copies cannot be updated as long as crash recovery is
10068                  * happening and we expect all the WAL to be replayed.
10069                  */
10070                 if (InArchiveRecovery)
10071                 {
10072                         minRecoveryPoint = ControlFile->minRecoveryPoint;
10073                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10074                 }
10075                 if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10076                 {
10077                         ControlFile->minRecoveryPoint = lsn;
10078                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10079                 }
10080
10081                 CommitTsParameterChange(xlrec.track_commit_timestamp,
10082                                                                 ControlFile->track_commit_timestamp);
10083                 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10084
10085                 UpdateControlFile();
10086                 LWLockRelease(ControlFileLock);
10087
10088                 /* Check to see if any changes to max_connections give problems */
10089                 CheckRequiredParameterValues();
10090         }
10091         else if (info == XLOG_FPW_CHANGE)
10092         {
10093                 bool            fpw;
10094
10095                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10096
10097                 /*
10098                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10099                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
10100                  * full_page_writes has been disabled during online backup.
10101                  */
10102                 if (!fpw)
10103                 {
10104                         SpinLockAcquire(&XLogCtl->info_lck);
10105                         if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10106                                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10107                         SpinLockRelease(&XLogCtl->info_lck);
10108                 }
10109
10110                 /* Keep track of full_page_writes */
10111                 lastFullPageWrites = fpw;
10112         }
10113 }
10114
10115 #ifdef WAL_DEBUG
10116
10117 static void
10118 xlog_outrec(StringInfo buf, XLogReaderState *record)
10119 {
10120         int                     block_id;
10121
10122         appendStringInfo(buf, "prev %X/%X; xid %u",
10123                                          (uint32) (XLogRecGetPrev(record) >> 32),
10124                                          (uint32) XLogRecGetPrev(record),
10125                                          XLogRecGetXid(record));
10126
10127         appendStringInfo(buf, "; len %u",
10128                                          XLogRecGetDataLen(record));
10129
10130         /* decode block references */
10131         for (block_id = 0; block_id <= record->max_block_id; block_id++)
10132         {
10133                 RelFileNode rnode;
10134                 ForkNumber      forknum;
10135                 BlockNumber blk;
10136
10137                 if (!XLogRecHasBlockRef(record, block_id))
10138                         continue;
10139
10140                 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10141                 if (forknum != MAIN_FORKNUM)
10142                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10143                                                          block_id,
10144                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
10145                                                          forknum,
10146                                                          blk);
10147                 else
10148                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10149                                                          block_id,
10150                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
10151                                                          blk);
10152                 if (XLogRecHasBlockImage(record, block_id))
10153                         appendStringInfoString(buf, " FPW");
10154         }
10155 }
10156 #endif                                                  /* WAL_DEBUG */
10157
10158 /*
10159  * Returns a string describing an XLogRecord, consisting of its identity
10160  * optionally followed by a colon, a space, and a further description.
10161  */
10162 static void
10163 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10164 {
10165         RmgrId          rmid = XLogRecGetRmid(record);
10166         uint8           info = XLogRecGetInfo(record);
10167         const char *id;
10168
10169         appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10170         appendStringInfoChar(buf, '/');
10171
10172         id = RmgrTable[rmid].rm_identify(info);
10173         if (id == NULL)
10174                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10175         else
10176                 appendStringInfo(buf, "%s: ", id);
10177
10178         RmgrTable[rmid].rm_desc(buf, record);
10179 }
10180
10181
10182 /*
10183  * Return the (possible) sync flag used for opening a file, depending on the
10184  * value of the GUC wal_sync_method.
10185  */
10186 static int
10187 get_sync_bit(int method)
10188 {
10189         int                     o_direct_flag = 0;
10190
10191         /* If fsync is disabled, never open in sync mode */
10192         if (!enableFsync)
10193                 return 0;
10194
10195         /*
10196          * Optimize writes by bypassing kernel cache with O_DIRECT when using
10197          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
10198          * disabled, otherwise the archive command or walsender process will read
10199          * the WAL soon after writing it, which is guaranteed to cause a physical
10200          * read if we bypassed the kernel cache. We also skip the
10201          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10202          * reason.
10203          *
10204          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10205          * written by walreceiver is normally read by the startup process soon
10206          * after its written. Also, walreceiver performs unaligned writes, which
10207          * don't work with O_DIRECT, so it is required for correctness too.
10208          */
10209         if (!XLogIsNeeded() && !AmWalReceiverProcess())
10210                 o_direct_flag = PG_O_DIRECT;
10211
10212         switch (method)
10213         {
10214                         /*
10215                          * enum values for all sync options are defined even if they are
10216                          * not supported on the current platform.  But if not, they are
10217                          * not included in the enum option array, and therefore will never
10218                          * be seen here.
10219                          */
10220                 case SYNC_METHOD_FSYNC:
10221                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10222                 case SYNC_METHOD_FDATASYNC:
10223                         return 0;
10224 #ifdef OPEN_SYNC_FLAG
10225                 case SYNC_METHOD_OPEN:
10226                         return OPEN_SYNC_FLAG | o_direct_flag;
10227 #endif
10228 #ifdef OPEN_DATASYNC_FLAG
10229                 case SYNC_METHOD_OPEN_DSYNC:
10230                         return OPEN_DATASYNC_FLAG | o_direct_flag;
10231 #endif
10232                 default:
10233                         /* can't happen (unless we are out of sync with option array) */
10234                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
10235                         return 0;                       /* silence warning */
10236         }
10237 }
10238
10239 /*
10240  * GUC support
10241  */
10242 void
10243 assign_xlog_sync_method(int new_sync_method, void *extra)
10244 {
10245         if (sync_method != new_sync_method)
10246         {
10247                 /*
10248                  * To ensure that no blocks escape unsynced, force an fsync on the
10249                  * currently open log segment (if any).  Also, if the open flag is
10250                  * changing, close the log file so it will be reopened (with new flag
10251                  * bit) at next use.
10252                  */
10253                 if (openLogFile >= 0)
10254                 {
10255                         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10256                         if (pg_fsync(openLogFile) != 0)
10257                                 ereport(PANIC,
10258                                                 (errcode_for_file_access(),
10259                                                  errmsg("could not fsync file \"%s\": %m",
10260                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10261                         pgstat_report_wait_end();
10262                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10263                                 XLogFileClose();
10264                 }
10265         }
10266 }
10267
10268
10269 /*
10270  * Issue appropriate kind of fsync (if any) for an XLOG output file.
10271  *
10272  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10273  * 'log' and 'seg' are for error reporting purposes.
10274  */
10275 void
10276 issue_xlog_fsync(int fd, XLogSegNo segno)
10277 {
10278         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
10279         switch (sync_method)
10280         {
10281                 case SYNC_METHOD_FSYNC:
10282                         if (pg_fsync_no_writethrough(fd) != 0)
10283                                 ereport(PANIC,
10284                                                 (errcode_for_file_access(),
10285                                                  errmsg("could not fsync file \"%s\": %m",
10286                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10287                         break;
10288 #ifdef HAVE_FSYNC_WRITETHROUGH
10289                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10290                         if (pg_fsync_writethrough(fd) != 0)
10291                                 ereport(PANIC,
10292                                                 (errcode_for_file_access(),
10293                                                  errmsg("could not fsync write-through file \"%s\": %m",
10294                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10295                         break;
10296 #endif
10297 #ifdef HAVE_FDATASYNC
10298                 case SYNC_METHOD_FDATASYNC:
10299                         if (pg_fdatasync(fd) != 0)
10300                                 ereport(PANIC,
10301                                                 (errcode_for_file_access(),
10302                                                  errmsg("could not fdatasync file \"%s\": %m",
10303                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10304                         break;
10305 #endif
10306                 case SYNC_METHOD_OPEN:
10307                 case SYNC_METHOD_OPEN_DSYNC:
10308                         /* write synced it already */
10309                         break;
10310                 default:
10311                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10312                         break;
10313         }
10314         pgstat_report_wait_end();
10315 }
10316
10317 /*
10318  * Return the filename of given log segment, as a palloc'd string.
10319  */
10320 char *
10321 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10322 {
10323         char       *result = palloc(MAXFNAMELEN);
10324
10325         XLogFileName(result, tli, segno, wal_segment_size);
10326         return result;
10327 }
10328
10329 /*
10330  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
10331  * function. It creates the necessary starting checkpoint and constructs the
10332  * backup label file.
10333  *
10334  * There are two kind of backups: exclusive and non-exclusive. An exclusive
10335  * backup is started with pg_start_backup(), and there can be only one active
10336  * at a time. The backup and tablespace map files of an exclusive backup are
10337  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10338  * removed by pg_stop_backup().
10339  *
10340  * A non-exclusive backup is used for the streaming base backups (see
10341  * src/backend/replication/basebackup.c). The difference to exclusive backups
10342  * is that the backup label and tablespace map files are not written to disk.
10343  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10344  * and the caller is responsible for including them in the backup archive as
10345  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10346  * active at the same time, and they don't conflict with an exclusive backup
10347  * either.
10348  *
10349  * tblspcmapfile is required mainly for tar format in windows as native windows
10350  * utilities are not able to create symlinks while extracting files from tar.
10351  * However for consistency, the same is used for all platforms.
10352  *
10353  * needtblspcmapfile is true for the cases (exclusive backup and for
10354  * non-exclusive backup only when tar format is used for taking backup)
10355  * when backup needs to generate tablespace_map file, it is used to
10356  * embed escape character before newline character in tablespace path.
10357  *
10358  * Returns the minimum WAL location that must be present to restore from this
10359  * backup, and the corresponding timeline ID in *starttli_p.
10360  *
10361  * Every successfully started non-exclusive backup must be stopped by calling
10362  * do_pg_stop_backup() or do_pg_abort_backup().
10363  *
10364  * It is the responsibility of the caller of this function to verify the
10365  * permissions of the calling user!
10366  */
10367 XLogRecPtr
10368 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10369                                    StringInfo labelfile, List **tablespaces,
10370                                    StringInfo tblspcmapfile, bool infotbssize,
10371                                    bool needtblspcmapfile)
10372 {
10373         bool            exclusive = (labelfile == NULL);
10374         bool            backup_started_in_recovery = false;
10375         XLogRecPtr      checkpointloc;
10376         XLogRecPtr      startpoint;
10377         TimeLineID      starttli;
10378         pg_time_t       stamp_time;
10379         char            strfbuf[128];
10380         char            xlogfilename[MAXFNAMELEN];
10381         XLogSegNo       _logSegNo;
10382         struct stat stat_buf;
10383         FILE       *fp;
10384
10385         backup_started_in_recovery = RecoveryInProgress();
10386
10387         /*
10388          * Currently only non-exclusive backup can be taken during recovery.
10389          */
10390         if (backup_started_in_recovery && exclusive)
10391                 ereport(ERROR,
10392                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10393                                  errmsg("recovery is in progress"),
10394                                  errhint("WAL control functions cannot be executed during recovery.")));
10395
10396         /*
10397          * During recovery, we don't need to check WAL level. Because, if WAL
10398          * level is not sufficient, it's impossible to get here during recovery.
10399          */
10400         if (!backup_started_in_recovery && !XLogIsNeeded())
10401                 ereport(ERROR,
10402                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10403                                  errmsg("WAL level not sufficient for making an online backup"),
10404                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10405
10406         if (strlen(backupidstr) > MAXPGPATH)
10407                 ereport(ERROR,
10408                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10409                                  errmsg("backup label too long (max %d bytes)",
10410                                                 MAXPGPATH)));
10411
10412         /*
10413          * Mark backup active in shared memory.  We must do full-page WAL writes
10414          * during an on-line backup even if not doing so at other times, because
10415          * it's quite possible for the backup dump to obtain a "torn" (partially
10416          * written) copy of a database page if it reads the page concurrently with
10417          * our write to the same page.  This can be fixed as long as the first
10418          * write to the page in the WAL sequence is a full-page write. Hence, we
10419          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10420          * are no dirty pages in shared memory that might get dumped while the
10421          * backup is in progress without having a corresponding WAL record.  (Once
10422          * the backup is complete, we need not force full-page writes anymore,
10423          * since we expect that any pages not modified during the backup interval
10424          * must have been correctly captured by the backup.)
10425          *
10426          * Note that forcePageWrites has no effect during an online backup from
10427          * the standby.
10428          *
10429          * We must hold all the insertion locks to change the value of
10430          * forcePageWrites, to ensure adequate interlocking against
10431          * XLogInsertRecord().
10432          */
10433         WALInsertLockAcquireExclusive();
10434         if (exclusive)
10435         {
10436                 /*
10437                  * At first, mark that we're now starting an exclusive backup, to
10438                  * ensure that there are no other sessions currently running
10439                  * pg_start_backup() or pg_stop_backup().
10440                  */
10441                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10442                 {
10443                         WALInsertLockRelease();
10444                         ereport(ERROR,
10445                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10446                                          errmsg("a backup is already in progress"),
10447                                          errhint("Run pg_stop_backup() and try again.")));
10448                 }
10449                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10450         }
10451         else
10452                 XLogCtl->Insert.nonExclusiveBackups++;
10453         XLogCtl->Insert.forcePageWrites = true;
10454         WALInsertLockRelease();
10455
10456         /* Ensure we release forcePageWrites if fail below */
10457         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10458         {
10459                 bool            gotUniqueStartpoint = false;
10460                 DIR                *tblspcdir;
10461                 struct dirent *de;
10462                 tablespaceinfo *ti;
10463                 int                     datadirpathlen;
10464
10465                 /*
10466                  * Force an XLOG file switch before the checkpoint, to ensure that the
10467                  * WAL segment the checkpoint is written to doesn't contain pages with
10468                  * old timeline IDs.  That would otherwise happen if you called
10469                  * pg_start_backup() right after restoring from a PITR archive: the
10470                  * first WAL segment containing the startup checkpoint has pages in
10471                  * the beginning with the old timeline ID.  That can cause trouble at
10472                  * recovery: we won't have a history file covering the old timeline if
10473                  * pg_wal directory was not included in the base backup and the WAL
10474                  * archive was cleared too before starting the backup.
10475                  *
10476                  * This also ensures that we have emitted a WAL page header that has
10477                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10478                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
10479                  * compress out removable backup blocks, it won't remove any that
10480                  * occur after this point.
10481                  *
10482                  * During recovery, we skip forcing XLOG file switch, which means that
10483                  * the backup taken during recovery is not available for the special
10484                  * recovery case described above.
10485                  */
10486                 if (!backup_started_in_recovery)
10487                         RequestXLogSwitch(false);
10488
10489                 do
10490                 {
10491                         bool            checkpointfpw;
10492
10493                         /*
10494                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
10495                          * page problems, this guarantees that two successive backup runs
10496                          * will have different checkpoint positions and hence different
10497                          * history file names, even if nothing happened in between.
10498                          *
10499                          * During recovery, establish a restartpoint if possible. We use
10500                          * the last restartpoint as the backup starting checkpoint. This
10501                          * means that two successive backup runs can have same checkpoint
10502                          * positions.
10503                          *
10504                          * Since the fact that we are executing do_pg_start_backup()
10505                          * during recovery means that checkpointer is running, we can use
10506                          * RequestCheckpoint() to establish a restartpoint.
10507                          *
10508                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10509                          * passing fast = true).  Otherwise this can take awhile.
10510                          */
10511                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10512                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
10513
10514                         /*
10515                          * Now we need to fetch the checkpoint record location, and also
10516                          * its REDO pointer.  The oldest point in WAL that would be needed
10517                          * to restore starting from the checkpoint is precisely the REDO
10518                          * pointer.
10519                          */
10520                         LWLockAcquire(ControlFileLock, LW_SHARED);
10521                         checkpointloc = ControlFile->checkPoint;
10522                         startpoint = ControlFile->checkPointCopy.redo;
10523                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10524                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10525                         LWLockRelease(ControlFileLock);
10526
10527                         if (backup_started_in_recovery)
10528                         {
10529                                 XLogRecPtr      recptr;
10530
10531                                 /*
10532                                  * Check to see if all WAL replayed during online backup
10533                                  * (i.e., since last restartpoint used as backup starting
10534                                  * checkpoint) contain full-page writes.
10535                                  */
10536                                 SpinLockAcquire(&XLogCtl->info_lck);
10537                                 recptr = XLogCtl->lastFpwDisableRecPtr;
10538                                 SpinLockRelease(&XLogCtl->info_lck);
10539
10540                                 if (!checkpointfpw || startpoint <= recptr)
10541                                         ereport(ERROR,
10542                                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10543                                                          errmsg("WAL generated with full_page_writes=off was replayed "
10544                                                                         "since last restartpoint"),
10545                                                          errhint("This means that the backup being taken on the standby "
10546                                                                          "is corrupt and should not be used. "
10547                                                                          "Enable full_page_writes and run CHECKPOINT on the master, "
10548                                                                          "and then try an online backup again.")));
10549
10550                                 /*
10551                                  * During recovery, since we don't use the end-of-backup WAL
10552                                  * record and don't write the backup history file, the
10553                                  * starting WAL location doesn't need to be unique. This means
10554                                  * that two base backups started at the same time might use
10555                                  * the same checkpoint as starting locations.
10556                                  */
10557                                 gotUniqueStartpoint = true;
10558                         }
10559
10560                         /*
10561                          * If two base backups are started at the same time (in WAL sender
10562                          * processes), we need to make sure that they use different
10563                          * checkpoints as starting locations, because we use the starting
10564                          * WAL location as a unique identifier for the base backup in the
10565                          * end-of-backup WAL record and when we write the backup history
10566                          * file. Perhaps it would be better generate a separate unique ID
10567                          * for each backup instead of forcing another checkpoint, but
10568                          * taking a checkpoint right after another is not that expensive
10569                          * either because only few buffers have been dirtied yet.
10570                          */
10571                         WALInsertLockAcquireExclusive();
10572                         if (XLogCtl->Insert.lastBackupStart < startpoint)
10573                         {
10574                                 XLogCtl->Insert.lastBackupStart = startpoint;
10575                                 gotUniqueStartpoint = true;
10576                         }
10577                         WALInsertLockRelease();
10578                 } while (!gotUniqueStartpoint);
10579
10580                 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
10581                 XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
10582
10583                 /*
10584                  * Construct tablespace_map file
10585                  */
10586                 if (exclusive)
10587                         tblspcmapfile = makeStringInfo();
10588
10589                 datadirpathlen = strlen(DataDir);
10590
10591                 /* Collect information about all tablespaces */
10592                 tblspcdir = AllocateDir("pg_tblspc");
10593                 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10594                 {
10595                         char            fullpath[MAXPGPATH + 10];
10596                         char            linkpath[MAXPGPATH];
10597                         char       *relpath = NULL;
10598                         int                     rllen;
10599                         StringInfoData buflinkpath;
10600                         char       *s = linkpath;
10601
10602                         /* Skip special stuff */
10603                         if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10604                                 continue;
10605
10606                         snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10607
10608 #if defined(HAVE_READLINK) || defined(WIN32)
10609                         rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10610                         if (rllen < 0)
10611                         {
10612                                 ereport(WARNING,
10613                                                 (errmsg("could not read symbolic link \"%s\": %m",
10614                                                                 fullpath)));
10615                                 continue;
10616                         }
10617                         else if (rllen >= sizeof(linkpath))
10618                         {
10619                                 ereport(WARNING,
10620                                                 (errmsg("symbolic link \"%s\" target is too long",
10621                                                                 fullpath)));
10622                                 continue;
10623                         }
10624                         linkpath[rllen] = '\0';
10625
10626                         /*
10627                          * Add the escape character '\\' before newline in a string to
10628                          * ensure that we can distinguish between the newline in the
10629                          * tablespace path and end of line while reading tablespace_map
10630                          * file during archive recovery.
10631                          */
10632                         initStringInfo(&buflinkpath);
10633
10634                         while (*s)
10635                         {
10636                                 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10637                                         appendStringInfoChar(&buflinkpath, '\\');
10638                                 appendStringInfoChar(&buflinkpath, *s++);
10639                         }
10640
10641                         /*
10642                          * Relpath holds the relative path of the tablespace directory
10643                          * when it's located within PGDATA, or NULL if it's located
10644                          * elsewhere.
10645                          */
10646                         if (rllen > datadirpathlen &&
10647                                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10648                                 IS_DIR_SEP(linkpath[datadirpathlen]))
10649                                 relpath = linkpath + datadirpathlen + 1;
10650
10651                         ti = palloc(sizeof(tablespaceinfo));
10652                         ti->oid = pstrdup(de->d_name);
10653                         ti->path = pstrdup(buflinkpath.data);
10654                         ti->rpath = relpath ? pstrdup(relpath) : NULL;
10655                         ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10656
10657                         if (tablespaces)
10658                                 *tablespaces = lappend(*tablespaces, ti);
10659
10660                         appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10661
10662                         pfree(buflinkpath.data);
10663 #else
10664
10665                         /*
10666                          * If the platform does not have symbolic links, it should not be
10667                          * possible to have tablespaces - clearly somebody else created
10668                          * them. Warn about it and ignore.
10669                          */
10670                         ereport(WARNING,
10671                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10672                                          errmsg("tablespaces are not supported on this platform")));
10673 #endif
10674                 }
10675                 FreeDir(tblspcdir);
10676
10677                 /*
10678                  * Construct backup label file
10679                  */
10680                 if (exclusive)
10681                         labelfile = makeStringInfo();
10682
10683                 /* Use the log timezone here, not the session timezone */
10684                 stamp_time = (pg_time_t) time(NULL);
10685                 pg_strftime(strfbuf, sizeof(strfbuf),
10686                                         "%Y-%m-%d %H:%M:%S %Z",
10687                                         pg_localtime(&stamp_time, log_timezone));
10688                 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10689                                                  (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10690                 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10691                                                  (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10692                 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10693                                                  exclusive ? "pg_start_backup" : "streamed");
10694                 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10695                                                  backup_started_in_recovery ? "standby" : "master");
10696                 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10697                 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10698                 appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
10699
10700                 /*
10701                  * Okay, write the file, or return its contents to caller.
10702                  */
10703                 if (exclusive)
10704                 {
10705                         /*
10706                          * Check for existing backup label --- implies a backup is already
10707                          * running.  (XXX given that we checked exclusiveBackupState
10708                          * above, maybe it would be OK to just unlink any such label
10709                          * file?)
10710                          */
10711                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10712                         {
10713                                 if (errno != ENOENT)
10714                                         ereport(ERROR,
10715                                                         (errcode_for_file_access(),
10716                                                          errmsg("could not stat file \"%s\": %m",
10717                                                                         BACKUP_LABEL_FILE)));
10718                         }
10719                         else
10720                                 ereport(ERROR,
10721                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10722                                                  errmsg("a backup is already in progress"),
10723                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10724                                                                  BACKUP_LABEL_FILE)));
10725
10726                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10727
10728                         if (!fp)
10729                                 ereport(ERROR,
10730                                                 (errcode_for_file_access(),
10731                                                  errmsg("could not create file \"%s\": %m",
10732                                                                 BACKUP_LABEL_FILE)));
10733                         if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10734                                 fflush(fp) != 0 ||
10735                                 pg_fsync(fileno(fp)) != 0 ||
10736                                 ferror(fp) ||
10737                                 FreeFile(fp))
10738                                 ereport(ERROR,
10739                                                 (errcode_for_file_access(),
10740                                                  errmsg("could not write file \"%s\": %m",
10741                                                                 BACKUP_LABEL_FILE)));
10742                         /* Allocated locally for exclusive backups, so free separately */
10743                         pfree(labelfile->data);
10744                         pfree(labelfile);
10745
10746                         /* Write backup tablespace_map file. */
10747                         if (tblspcmapfile->len > 0)
10748                         {
10749                                 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10750                                 {
10751                                         if (errno != ENOENT)
10752                                                 ereport(ERROR,
10753                                                                 (errcode_for_file_access(),
10754                                                                  errmsg("could not stat file \"%s\": %m",
10755                                                                                 TABLESPACE_MAP)));
10756                                 }
10757                                 else
10758                                         ereport(ERROR,
10759                                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10760                                                          errmsg("a backup is already in progress"),
10761                                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10762                                                                          TABLESPACE_MAP)));
10763
10764                                 fp = AllocateFile(TABLESPACE_MAP, "w");
10765
10766                                 if (!fp)
10767                                         ereport(ERROR,
10768                                                         (errcode_for_file_access(),
10769                                                          errmsg("could not create file \"%s\": %m",
10770                                                                         TABLESPACE_MAP)));
10771                                 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10772                                         fflush(fp) != 0 ||
10773                                         pg_fsync(fileno(fp)) != 0 ||
10774                                         ferror(fp) ||
10775                                         FreeFile(fp))
10776                                         ereport(ERROR,
10777                                                         (errcode_for_file_access(),
10778                                                          errmsg("could not write file \"%s\": %m",
10779                                                                         TABLESPACE_MAP)));
10780                         }
10781
10782                         /* Allocated locally for exclusive backups, so free separately */
10783                         pfree(tblspcmapfile->data);
10784                         pfree(tblspcmapfile);
10785                 }
10786         }
10787         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10788
10789         /*
10790          * Mark that start phase has correctly finished for an exclusive backup.
10791          * Session-level locks are updated as well to reflect that state.
10792          *
10793          * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
10794          * counters and session-level lock. Otherwise they can be updated
10795          * inconsistently, and which might cause do_pg_abort_backup() to fail.
10796          */
10797         if (exclusive)
10798         {
10799                 WALInsertLockAcquireExclusive();
10800                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10801
10802                 /* Set session-level lock */
10803                 sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
10804                 WALInsertLockRelease();
10805         }
10806         else
10807                 sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
10808
10809         /*
10810          * We're done.  As a convenience, return the starting WAL location.
10811          */
10812         if (starttli_p)
10813                 *starttli_p = starttli;
10814         return startpoint;
10815 }
10816
10817 /* Error cleanup callback for pg_start_backup */
10818 static void
10819 pg_start_backup_callback(int code, Datum arg)
10820 {
10821         bool            exclusive = DatumGetBool(arg);
10822
10823         /* Update backup counters and forcePageWrites on failure */
10824         WALInsertLockAcquireExclusive();
10825         if (exclusive)
10826         {
10827                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10828                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10829         }
10830         else
10831         {
10832                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10833                 XLogCtl->Insert.nonExclusiveBackups--;
10834         }
10835
10836         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10837                 XLogCtl->Insert.nonExclusiveBackups == 0)
10838         {
10839                 XLogCtl->Insert.forcePageWrites = false;
10840         }
10841         WALInsertLockRelease();
10842 }
10843
10844 /*
10845  * Error cleanup callback for pg_stop_backup
10846  */
10847 static void
10848 pg_stop_backup_callback(int code, Datum arg)
10849 {
10850         bool            exclusive = DatumGetBool(arg);
10851
10852         /* Update backup status on failure */
10853         WALInsertLockAcquireExclusive();
10854         if (exclusive)
10855         {
10856                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10857                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10858         }
10859         WALInsertLockRelease();
10860 }
10861
10862 /*
10863  * Utility routine to fetch the session-level status of a backup running.
10864  */
10865 SessionBackupState
10866 get_backup_status(void)
10867 {
10868         return sessionBackupState;
10869 }
10870
10871 /*
10872  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10873  * function.
10874  *
10875  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10876  * the non-exclusive backup specified by 'labelfile'.
10877  *
10878  * Returns the last WAL location that must be present to restore from this
10879  * backup, and the corresponding timeline ID in *stoptli_p.
10880  *
10881  * It is the responsibility of the caller of this function to verify the
10882  * permissions of the calling user!
10883  */
10884 XLogRecPtr
10885 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10886 {
10887         bool            exclusive = (labelfile == NULL);
10888         bool            backup_started_in_recovery = false;
10889         XLogRecPtr      startpoint;
10890         XLogRecPtr      stoppoint;
10891         TimeLineID      stoptli;
10892         pg_time_t       stamp_time;
10893         char            strfbuf[128];
10894         char            histfilepath[MAXPGPATH];
10895         char            startxlogfilename[MAXFNAMELEN];
10896         char            stopxlogfilename[MAXFNAMELEN];
10897         char            lastxlogfilename[MAXFNAMELEN];
10898         char            histfilename[MAXFNAMELEN];
10899         char            backupfrom[20];
10900         XLogSegNo       _logSegNo;
10901         FILE       *lfp;
10902         FILE       *fp;
10903         char            ch;
10904         int                     seconds_before_warning;
10905         int                     waits = 0;
10906         bool            reported_waiting = false;
10907         char       *remaining;
10908         char       *ptr;
10909         uint32          hi,
10910                                 lo;
10911
10912         backup_started_in_recovery = RecoveryInProgress();
10913
10914         /*
10915          * Currently only non-exclusive backup can be taken during recovery.
10916          */
10917         if (backup_started_in_recovery && exclusive)
10918                 ereport(ERROR,
10919                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10920                                  errmsg("recovery is in progress"),
10921                                  errhint("WAL control functions cannot be executed during recovery.")));
10922
10923         /*
10924          * During recovery, we don't need to check WAL level. Because, if WAL
10925          * level is not sufficient, it's impossible to get here during recovery.
10926          */
10927         if (!backup_started_in_recovery && !XLogIsNeeded())
10928                 ereport(ERROR,
10929                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10930                                  errmsg("WAL level not sufficient for making an online backup"),
10931                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10932
10933         if (exclusive)
10934         {
10935                 /*
10936                  * At first, mark that we're now stopping an exclusive backup, to
10937                  * ensure that there are no other sessions currently running
10938                  * pg_start_backup() or pg_stop_backup().
10939                  */
10940                 WALInsertLockAcquireExclusive();
10941                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10942                 {
10943                         WALInsertLockRelease();
10944                         ereport(ERROR,
10945                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10946                                          errmsg("exclusive backup not in progress")));
10947                 }
10948                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10949                 WALInsertLockRelease();
10950
10951                 /*
10952                  * Remove backup_label. In case of failure, the state for an exclusive
10953                  * backup is switched back to in-progress.
10954                  */
10955                 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10956                 {
10957                         /*
10958                          * Read the existing label file into memory.
10959                          */
10960                         struct stat statbuf;
10961                         int                     r;
10962
10963                         if (stat(BACKUP_LABEL_FILE, &statbuf))
10964                         {
10965                                 /* should not happen per the upper checks */
10966                                 if (errno != ENOENT)
10967                                         ereport(ERROR,
10968                                                         (errcode_for_file_access(),
10969                                                          errmsg("could not stat file \"%s\": %m",
10970                                                                         BACKUP_LABEL_FILE)));
10971                                 ereport(ERROR,
10972                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10973                                                  errmsg("a backup is not in progress")));
10974                         }
10975
10976                         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10977                         if (!lfp)
10978                         {
10979                                 ereport(ERROR,
10980                                                 (errcode_for_file_access(),
10981                                                  errmsg("could not read file \"%s\": %m",
10982                                                                 BACKUP_LABEL_FILE)));
10983                         }
10984                         labelfile = palloc(statbuf.st_size + 1);
10985                         r = fread(labelfile, statbuf.st_size, 1, lfp);
10986                         labelfile[statbuf.st_size] = '\0';
10987
10988                         /*
10989                          * Close and remove the backup label file
10990                          */
10991                         if (r != 1 || ferror(lfp) || FreeFile(lfp))
10992                                 ereport(ERROR,
10993                                                 (errcode_for_file_access(),
10994                                                  errmsg("could not read file \"%s\": %m",
10995                                                                 BACKUP_LABEL_FILE)));
10996                         durable_unlink(BACKUP_LABEL_FILE, ERROR);
10997
10998                         /*
10999                          * Remove tablespace_map file if present, it is created only if
11000                          * there are tablespaces.
11001                          */
11002                         durable_unlink(TABLESPACE_MAP, DEBUG1);
11003                 }
11004                 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11005         }
11006
11007         /*
11008          * OK to update backup counters, forcePageWrites and session-level lock.
11009          *
11010          * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
11011          * Otherwise they can be updated inconsistently, and which might cause
11012          * do_pg_abort_backup() to fail.
11013          */
11014         WALInsertLockAcquireExclusive();
11015         if (exclusive)
11016         {
11017                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11018         }
11019         else
11020         {
11021                 /*
11022                  * The user-visible pg_start/stop_backup() functions that operate on
11023                  * exclusive backups can be called at any time, but for non-exclusive
11024                  * backups, it is expected that each do_pg_start_backup() call is
11025                  * matched by exactly one do_pg_stop_backup() call.
11026                  */
11027                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11028                 XLogCtl->Insert.nonExclusiveBackups--;
11029         }
11030
11031         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11032                 XLogCtl->Insert.nonExclusiveBackups == 0)
11033         {
11034                 XLogCtl->Insert.forcePageWrites = false;
11035         }
11036
11037         /*
11038          * Clean up session-level lock.
11039          *
11040          * You might think that WALInsertLockRelease() can be called before
11041          * cleaning up session-level lock because session-level lock doesn't need
11042          * to be protected with WAL insertion lock. But since
11043          * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
11044          * cleaned up before it.
11045          */
11046         sessionBackupState = SESSION_BACKUP_NONE;
11047
11048         WALInsertLockRelease();
11049
11050         /*
11051          * Read and parse the START WAL LOCATION line (this code is pretty crude,
11052          * but we are not expecting any variability in the file format).
11053          */
11054         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
11055                            &hi, &lo, startxlogfilename,
11056                            &ch) != 4 || ch != '\n')
11057                 ereport(ERROR,
11058                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11059                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11060         startpoint = ((uint64) hi) << 32 | lo;
11061         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
11062
11063         /*
11064          * Parse the BACKUP FROM line. If we are taking an online backup from the
11065          * standby, we confirm that the standby has not been promoted during the
11066          * backup.
11067          */
11068         ptr = strstr(remaining, "BACKUP FROM:");
11069         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
11070                 ereport(ERROR,
11071                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11072                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11073         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
11074                 ereport(ERROR,
11075                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11076                                  errmsg("the standby was promoted during online backup"),
11077                                  errhint("This means that the backup being taken is corrupt "
11078                                                  "and should not be used. "
11079                                                  "Try taking another online backup.")));
11080
11081         /*
11082          * During recovery, we don't write an end-of-backup record. We assume that
11083          * pg_control was backed up last and its minimum recovery point can be
11084          * available as the backup end location. Since we don't have an
11085          * end-of-backup record, we use the pg_control value to check whether
11086          * we've reached the end of backup when starting recovery from this
11087          * backup. We have no way of checking if pg_control wasn't backed up last
11088          * however.
11089          *
11090          * We don't force a switch to new WAL file but it is still possible to
11091          * wait for all the required files to be archived if waitforarchive is
11092          * true. This is okay if we use the backup to start a standby and fetch
11093          * the missing WAL using streaming replication. But in the case of an
11094          * archive recovery, a user should set waitforarchive to true and wait for
11095          * them to be archived to ensure that all the required files are
11096          * available.
11097          *
11098          * We return the current minimum recovery point as the backup end
11099          * location. Note that it can be greater than the exact backup end
11100          * location if the minimum recovery point is updated after the backup of
11101          * pg_control. This is harmless for current uses.
11102          *
11103          * XXX currently a backup history file is for informational and debug
11104          * purposes only. It's not essential for an online backup. Furthermore,
11105          * even if it's created, it will not be archived during recovery because
11106          * an archiver is not invoked. So it doesn't seem worthwhile to write a
11107          * backup history file during recovery.
11108          */
11109         if (backup_started_in_recovery)
11110         {
11111                 XLogRecPtr      recptr;
11112
11113                 /*
11114                  * Check to see if all WAL replayed during online backup contain
11115                  * full-page writes.
11116                  */
11117                 SpinLockAcquire(&XLogCtl->info_lck);
11118                 recptr = XLogCtl->lastFpwDisableRecPtr;
11119                 SpinLockRelease(&XLogCtl->info_lck);
11120
11121                 if (startpoint <= recptr)
11122                         ereport(ERROR,
11123                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11124                                          errmsg("WAL generated with full_page_writes=off was replayed "
11125                                                         "during online backup"),
11126                                          errhint("This means that the backup being taken on the standby "
11127                                                          "is corrupt and should not be used. "
11128                                                          "Enable full_page_writes and run CHECKPOINT on the master, "
11129                                                          "and then try an online backup again.")));
11130
11131
11132                 LWLockAcquire(ControlFileLock, LW_SHARED);
11133                 stoppoint = ControlFile->minRecoveryPoint;
11134                 stoptli = ControlFile->minRecoveryPointTLI;
11135                 LWLockRelease(ControlFileLock);
11136         }
11137         else
11138         {
11139                 /*
11140                  * Write the backup-end xlog record
11141                  */
11142                 XLogBeginInsert();
11143                 XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
11144                 stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
11145                 stoptli = ThisTimeLineID;
11146
11147                 /*
11148                  * Force a switch to a new xlog segment file, so that the backup is
11149                  * valid as soon as archiver moves out the current segment file.
11150                  */
11151                 RequestXLogSwitch(false);
11152
11153                 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11154                 XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
11155
11156                 /* Use the log timezone here, not the session timezone */
11157                 stamp_time = (pg_time_t) time(NULL);
11158                 pg_strftime(strfbuf, sizeof(strfbuf),
11159                                         "%Y-%m-%d %H:%M:%S %Z",
11160                                         pg_localtime(&stamp_time, log_timezone));
11161
11162                 /*
11163                  * Write the backup history file
11164                  */
11165                 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11166                 BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
11167                                                           startpoint, wal_segment_size);
11168                 fp = AllocateFile(histfilepath, "w");
11169                 if (!fp)
11170                         ereport(ERROR,
11171                                         (errcode_for_file_access(),
11172                                          errmsg("could not create file \"%s\": %m",
11173                                                         histfilepath)));
11174                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
11175                                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
11176                 fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11177                                 (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
11178
11179                 /*
11180                  * Transfer remaining lines including label and start timeline to
11181                  * history file.
11182                  */
11183                 fprintf(fp, "%s", remaining);
11184                 fprintf(fp, "STOP TIME: %s\n", strfbuf);
11185                 fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11186                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
11187                         ereport(ERROR,
11188                                         (errcode_for_file_access(),
11189                                          errmsg("could not write file \"%s\": %m",
11190                                                         histfilepath)));
11191
11192                 /*
11193                  * Clean out any no-longer-needed history files.  As a side effect,
11194                  * this will post a .ready file for the newly created history file,
11195                  * notifying the archiver that history file may be archived
11196                  * immediately.
11197                  */
11198                 CleanupBackupHistory();
11199         }
11200
11201         /*
11202          * If archiving is enabled, wait for all the required WAL files to be
11203          * archived before returning. If archiving isn't enabled, the required WAL
11204          * needs to be transported via streaming replication (hopefully with
11205          * wal_keep_segments set high enough), or some more exotic mechanism like
11206          * polling and copying files from pg_wal with script. We have no knowledge
11207          * of those mechanisms, so it's up to the user to ensure that he gets all
11208          * the required WAL.
11209          *
11210          * We wait until both the last WAL file filled during backup and the
11211          * history file have been archived, and assume that the alphabetic sorting
11212          * property of the WAL files ensures any earlier WAL files are safely
11213          * archived as well.
11214          *
11215          * We wait forever, since archive_command is supposed to work and we
11216          * assume the admin wanted his backup to work completely. If you don't
11217          * wish to wait, then either waitforarchive should be passed in as false,
11218          * or you can set statement_timeout.  Also, some notices are issued to
11219          * clue in anyone who might be doing this interactively.
11220          */
11221
11222         if (waitforarchive &&
11223                 ((!backup_started_in_recovery && XLogArchivingActive()) ||
11224                  (backup_started_in_recovery && XLogArchivingAlways())))
11225         {
11226                 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11227                 XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11228
11229                 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11230                 BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11231                                                           startpoint, wal_segment_size);
11232
11233                 seconds_before_warning = 60;
11234                 waits = 0;
11235
11236                 while (XLogArchiveIsBusy(lastxlogfilename) ||
11237                            XLogArchiveIsBusy(histfilename))
11238                 {
11239                         CHECK_FOR_INTERRUPTS();
11240
11241                         if (!reported_waiting && waits > 5)
11242                         {
11243                                 ereport(NOTICE,
11244                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
11245                                 reported_waiting = true;
11246                         }
11247
11248                         pg_usleep(1000000L);
11249
11250                         if (++waits >= seconds_before_warning)
11251                         {
11252                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
11253                                 ereport(WARNING,
11254                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11255                                                                 waits),
11256                                                  errhint("Check that your archive_command is executing properly.  "
11257                                                                  "pg_stop_backup can be canceled safely, "
11258                                                                  "but the database backup will not be usable without all the WAL segments.")));
11259                         }
11260                 }
11261
11262                 ereport(NOTICE,
11263                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
11264         }
11265         else if (waitforarchive)
11266                 ereport(NOTICE,
11267                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11268
11269         /*
11270          * We're done.  As a convenience, return the ending WAL location.
11271          */
11272         if (stoptli_p)
11273                 *stoptli_p = stoptli;
11274         return stoppoint;
11275 }
11276
11277
11278 /*
11279  * do_pg_abort_backup: abort a running backup
11280  *
11281  * This does just the most basic steps of do_pg_stop_backup(), by taking the
11282  * system out of backup mode, thus making it a lot more safe to call from
11283  * an error handler.
11284  *
11285  * NB: This is only for aborting a non-exclusive backup that doesn't write
11286  * backup_label. A backup started with pg_start_backup() needs to be finished
11287  * with pg_stop_backup().
11288  */
11289 void
11290 do_pg_abort_backup(void)
11291 {
11292         /*
11293          * Quick exit if session is not keeping around a non-exclusive backup
11294          * already started.
11295          */
11296         if (sessionBackupState == SESSION_BACKUP_NONE)
11297                 return;
11298
11299         WALInsertLockAcquireExclusive();
11300         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11301         Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
11302         XLogCtl->Insert.nonExclusiveBackups--;
11303
11304         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11305                 XLogCtl->Insert.nonExclusiveBackups == 0)
11306         {
11307                 XLogCtl->Insert.forcePageWrites = false;
11308         }
11309         WALInsertLockRelease();
11310 }
11311
11312 /*
11313  * Get latest redo apply position.
11314  *
11315  * Exported to allow WALReceiver to read the pointer directly.
11316  */
11317 XLogRecPtr
11318 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11319 {
11320         XLogRecPtr      recptr;
11321         TimeLineID      tli;
11322
11323         SpinLockAcquire(&XLogCtl->info_lck);
11324         recptr = XLogCtl->lastReplayedEndRecPtr;
11325         tli = XLogCtl->lastReplayedTLI;
11326         SpinLockRelease(&XLogCtl->info_lck);
11327
11328         if (replayTLI)
11329                 *replayTLI = tli;
11330         return recptr;
11331 }
11332
11333 /*
11334  * Get latest WAL insert pointer
11335  */
11336 XLogRecPtr
11337 GetXLogInsertRecPtr(void)
11338 {
11339         XLogCtlInsert *Insert = &XLogCtl->Insert;
11340         uint64          current_bytepos;
11341
11342         SpinLockAcquire(&Insert->insertpos_lck);
11343         current_bytepos = Insert->CurrBytePos;
11344         SpinLockRelease(&Insert->insertpos_lck);
11345
11346         return XLogBytePosToRecPtr(current_bytepos);
11347 }
11348
11349 /*
11350  * Get latest WAL write pointer
11351  */
11352 XLogRecPtr
11353 GetXLogWriteRecPtr(void)
11354 {
11355         SpinLockAcquire(&XLogCtl->info_lck);
11356         LogwrtResult = XLogCtl->LogwrtResult;
11357         SpinLockRelease(&XLogCtl->info_lck);
11358
11359         return LogwrtResult.Write;
11360 }
11361
11362 /*
11363  * Returns the redo pointer of the last checkpoint or restartpoint. This is
11364  * the oldest point in WAL that we still need, if we have to restart recovery.
11365  */
11366 void
11367 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11368 {
11369         LWLockAcquire(ControlFileLock, LW_SHARED);
11370         *oldrecptr = ControlFile->checkPointCopy.redo;
11371         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11372         LWLockRelease(ControlFileLock);
11373 }
11374
11375 /*
11376  * read_backup_label: check to see if a backup_label file is present
11377  *
11378  * If we see a backup_label during recovery, we assume that we are recovering
11379  * from a backup dump file, and we therefore roll forward from the checkpoint
11380  * identified by the label file, NOT what pg_control says.  This avoids the
11381  * problem that pg_control might have been archived one or more checkpoints
11382  * later than the start of the dump, and so if we rely on it as the start
11383  * point, we will fail to restore a consistent database state.
11384  *
11385  * Returns true if a backup_label was found (and fills the checkpoint
11386  * location and its REDO location into *checkPointLoc and RedoStartLSN,
11387  * respectively); returns false if not. If this backup_label came from a
11388  * streamed backup, *backupEndRequired is set to true. If this backup_label
11389  * was created during recovery, *backupFromStandby is set to true.
11390  */
11391 static bool
11392 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
11393                                   bool *backupFromStandby)
11394 {
11395         char            startxlogfilename[MAXFNAMELEN];
11396         TimeLineID      tli_from_walseg,
11397                                 tli_from_file;
11398         FILE       *lfp;
11399         char            ch;
11400         char            backuptype[20];
11401         char            backupfrom[20];
11402         char            backuplabel[MAXPGPATH];
11403         char            backuptime[128];
11404         uint32          hi,
11405                                 lo;
11406
11407         *backupEndRequired = false;
11408         *backupFromStandby = false;
11409
11410         /*
11411          * See if label file is present
11412          */
11413         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11414         if (!lfp)
11415         {
11416                 if (errno != ENOENT)
11417                         ereport(FATAL,
11418                                         (errcode_for_file_access(),
11419                                          errmsg("could not read file \"%s\": %m",
11420                                                         BACKUP_LABEL_FILE)));
11421                 return false;                   /* it's not there, all is fine */
11422         }
11423
11424         /*
11425          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11426          * is pretty crude, but we are not expecting any variability in the file
11427          * format).
11428          */
11429         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11430                            &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
11431                 ereport(FATAL,
11432                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11433                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11434         RedoStartLSN = ((uint64) hi) << 32 | lo;
11435         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11436                            &hi, &lo, &ch) != 3 || ch != '\n')
11437                 ereport(FATAL,
11438                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11439                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11440         *checkPointLoc = ((uint64) hi) << 32 | lo;
11441
11442         /*
11443          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11444          * from an older backup anyway, but since the information on it is not
11445          * strictly required, don't error out if it's missing for some reason.
11446          */
11447         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
11448         {
11449                 if (strcmp(backuptype, "streamed") == 0)
11450                         *backupEndRequired = true;
11451         }
11452
11453         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
11454         {
11455                 if (strcmp(backupfrom, "standby") == 0)
11456                         *backupFromStandby = true;
11457         }
11458
11459         /*
11460          * Parse START TIME and LABEL. Those are not mandatory fields for recovery
11461          * but checking for their presence is useful for debugging and the next
11462          * sanity checks. Cope also with the fact that the result buffers have a
11463          * pre-allocated size, hence if the backup_label file has been generated
11464          * with strings longer than the maximum assumed here an incorrect parsing
11465          * happens. That's fine as only minor consistency checks are done
11466          * afterwards.
11467          */
11468         if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
11469                 ereport(DEBUG1,
11470                                 (errmsg("backup time %s in file \"%s\"",
11471                                                 backuptime, BACKUP_LABEL_FILE)));
11472
11473         if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
11474                 ereport(DEBUG1,
11475                                 (errmsg("backup label %s in file \"%s\"",
11476                                                 backuplabel, BACKUP_LABEL_FILE)));
11477
11478         /*
11479          * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
11480          * it as a sanity check if present.
11481          */
11482         if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
11483         {
11484                 if (tli_from_walseg != tli_from_file)
11485                         ereport(FATAL,
11486                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11487                                          errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
11488                                          errdetail("Timeline ID parsed is %u, but expected %u",
11489                                                            tli_from_file, tli_from_walseg)));
11490
11491                 ereport(DEBUG1,
11492                                 (errmsg("backup timeline %u in file \"%s\"",
11493                                                 tli_from_file, BACKUP_LABEL_FILE)));
11494         }
11495
11496         if (ferror(lfp) || FreeFile(lfp))
11497                 ereport(FATAL,
11498                                 (errcode_for_file_access(),
11499                                  errmsg("could not read file \"%s\": %m",
11500                                                 BACKUP_LABEL_FILE)));
11501
11502         return true;
11503 }
11504
11505 /*
11506  * read_tablespace_map: check to see if a tablespace_map file is present
11507  *
11508  * If we see a tablespace_map file during recovery, we assume that we are
11509  * recovering from a backup dump file, and we therefore need to create symlinks
11510  * as per the information present in tablespace_map file.
11511  *
11512  * Returns true if a tablespace_map file was found (and fills the link
11513  * information for all the tablespace links present in file); returns false
11514  * if not.
11515  */
11516 static bool
11517 read_tablespace_map(List **tablespaces)
11518 {
11519         tablespaceinfo *ti;
11520         FILE       *lfp;
11521         char            tbsoid[MAXPGPATH];
11522         char       *tbslinkpath;
11523         char            str[MAXPGPATH];
11524         int                     ch,
11525                                 prev_ch = -1,
11526                                 i = 0,
11527                                 n;
11528
11529         /*
11530          * See if tablespace_map file is present
11531          */
11532         lfp = AllocateFile(TABLESPACE_MAP, "r");
11533         if (!lfp)
11534         {
11535                 if (errno != ENOENT)
11536                         ereport(FATAL,
11537                                         (errcode_for_file_access(),
11538                                          errmsg("could not read file \"%s\": %m",
11539                                                         TABLESPACE_MAP)));
11540                 return false;                   /* it's not there, all is fine */
11541         }
11542
11543         /*
11544          * Read and parse the link name and path lines from tablespace_map file
11545          * (this code is pretty crude, but we are not expecting any variability in
11546          * the file format).  While taking backup we embed escape character '\\'
11547          * before newline in tablespace path, so that during reading of
11548          * tablespace_map file, we could distinguish newline in tablespace path
11549          * and end of line.  Now while reading tablespace_map file, remove the
11550          * escape character that has been added in tablespace path during backup.
11551          */
11552         while ((ch = fgetc(lfp)) != EOF)
11553         {
11554                 if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11555                 {
11556                         str[i] = '\0';
11557                         if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11558                                 ereport(FATAL,
11559                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11560                                                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11561                         tbslinkpath = str + n;
11562                         i = 0;
11563
11564                         ti = palloc(sizeof(tablespaceinfo));
11565                         ti->oid = pstrdup(tbsoid);
11566                         ti->path = pstrdup(tbslinkpath);
11567
11568                         *tablespaces = lappend(*tablespaces, ti);
11569                         continue;
11570                 }
11571                 else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11572                         str[i - 1] = ch;
11573                 else
11574                         str[i++] = ch;
11575                 prev_ch = ch;
11576         }
11577
11578         if (ferror(lfp) || FreeFile(lfp))
11579                 ereport(FATAL,
11580                                 (errcode_for_file_access(),
11581                                  errmsg("could not read file \"%s\": %m",
11582                                                 TABLESPACE_MAP)));
11583
11584         return true;
11585 }
11586
11587 /*
11588  * Error context callback for errors occurring during rm_redo().
11589  */
11590 static void
11591 rm_redo_error_callback(void *arg)
11592 {
11593         XLogReaderState *record = (XLogReaderState *) arg;
11594         StringInfoData buf;
11595
11596         initStringInfo(&buf);
11597         xlog_outdesc(&buf, record);
11598
11599         /* translator: %s is a WAL record description */
11600         errcontext("WAL redo at %X/%X for %s",
11601                            (uint32) (record->ReadRecPtr >> 32),
11602                            (uint32) record->ReadRecPtr,
11603                            buf.data);
11604
11605         pfree(buf.data);
11606 }
11607
11608 /*
11609  * BackupInProgress: check if online backup mode is active
11610  *
11611  * This is done by checking for existence of the "backup_label" file.
11612  */
11613 bool
11614 BackupInProgress(void)
11615 {
11616         struct stat stat_buf;
11617
11618         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11619 }
11620
11621 /*
11622  * CancelBackup: rename the "backup_label" and "tablespace_map"
11623  *                               files to cancel backup mode
11624  *
11625  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11626  * Similarly, if the "tablespace_map" file exists, it will be renamed to
11627  * "tablespace_map.old".
11628  *
11629  * Note that this will render an online backup in progress
11630  * useless. To correctly finish an online backup, pg_stop_backup must be
11631  * called.
11632  */
11633 void
11634 CancelBackup(void)
11635 {
11636         struct stat stat_buf;
11637
11638         /* if the backup_label file is not there, return */
11639         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11640                 return;
11641
11642         /* remove leftover file from previously canceled backup if it exists */
11643         unlink(BACKUP_LABEL_OLD);
11644
11645         if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11646         {
11647                 ereport(WARNING,
11648                                 (errcode_for_file_access(),
11649                                  errmsg("online backup mode was not canceled"),
11650                                  errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11651                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11652                 return;
11653         }
11654
11655         /* if the tablespace_map file is not there, return */
11656         if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11657         {
11658                 ereport(LOG,
11659                                 (errmsg("online backup mode canceled"),
11660                                  errdetail("File \"%s\" was renamed to \"%s\".",
11661                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11662                 return;
11663         }
11664
11665         /* remove leftover file from previously canceled backup if it exists */
11666         unlink(TABLESPACE_MAP_OLD);
11667
11668         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11669         {
11670                 ereport(LOG,
11671                                 (errmsg("online backup mode canceled"),
11672                                  errdetail("Files \"%s\" and \"%s\" were renamed to "
11673                                                    "\"%s\" and \"%s\", respectively.",
11674                                                    BACKUP_LABEL_FILE, TABLESPACE_MAP,
11675                                                    BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11676         }
11677         else
11678         {
11679                 ereport(WARNING,
11680                                 (errcode_for_file_access(),
11681                                  errmsg("online backup mode canceled"),
11682                                  errdetail("File \"%s\" was renamed to \"%s\", but "
11683                                                    "file \"%s\" could not be renamed to \"%s\": %m.",
11684                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11685                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11686         }
11687 }
11688
11689 /*
11690  * Read the XLOG page containing RecPtr into readBuf (if not read already).
11691  * Returns number of bytes read, if the page is read successfully, or -1
11692  * in case of errors.  When errors occur, they are ereport'ed, but only
11693  * if they have not been previously reported.
11694  *
11695  * This is responsible for restoring files from archive as needed, as well
11696  * as for waiting for the requested WAL record to arrive in standby mode.
11697  *
11698  * 'emode' specifies the log level used for reporting "file not found" or
11699  * "end of WAL" situations in archive recovery, or in standby mode when a
11700  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11701  * false in those situations, on higher log levels the ereport() won't
11702  * return.
11703  *
11704  * In standby mode, if after a successful return of XLogPageRead() the
11705  * caller finds the record it's interested in to be broken, it should
11706  * ereport the error with the level determined by
11707  * emode_for_corrupt_record(), and then set lastSourceFailed
11708  * and call XLogPageRead() again with the same arguments. This lets
11709  * XLogPageRead() to try fetching the record from another source, or to
11710  * sleep and retry.
11711  */
11712 static int
11713 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
11714                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
11715 {
11716         XLogPageReadPrivate *private =
11717         (XLogPageReadPrivate *) xlogreader->private_data;
11718         int                     emode = private->emode;
11719         uint32          targetPageOff;
11720         XLogSegNo       targetSegNo PG_USED_FOR_ASSERTS_ONLY;
11721         int                     r;
11722
11723         XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
11724         targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
11725
11726         /*
11727          * See if we need to switch to a new segment because the requested record
11728          * is not in the currently open one.
11729          */
11730         if (readFile >= 0 &&
11731                 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
11732         {
11733                 /*
11734                  * Request a restartpoint if we've replayed too much xlog since the
11735                  * last one.
11736                  */
11737                 if (bgwriterLaunched)
11738                 {
11739                         if (XLogCheckpointNeeded(readSegNo))
11740                         {
11741                                 (void) GetRedoRecPtr();
11742                                 if (XLogCheckpointNeeded(readSegNo))
11743                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
11744                         }
11745                 }
11746
11747                 close(readFile);
11748                 readFile = -1;
11749                 readSource = 0;
11750         }
11751
11752         XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
11753
11754 retry:
11755         /* See if we need to retrieve more data */
11756         if (readFile < 0 ||
11757                 (readSource == XLOG_FROM_STREAM &&
11758                  receivedUpto < targetPagePtr + reqLen))
11759         {
11760                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
11761                                                                                  private->randAccess,
11762                                                                                  private->fetching_ckpt,
11763                                                                                  targetRecPtr))
11764                 {
11765                         if (readFile >= 0)
11766                                 close(readFile);
11767                         readFile = -1;
11768                         readLen = 0;
11769                         readSource = 0;
11770
11771                         return -1;
11772                 }
11773         }
11774
11775         /*
11776          * At this point, we have the right segment open and if we're streaming we
11777          * know the requested record is in it.
11778          */
11779         Assert(readFile != -1);
11780
11781         /*
11782          * If the current segment is being streamed from master, calculate how
11783          * much of the current page we have received already. We know the
11784          * requested record has been received, but this is for the benefit of
11785          * future calls, to allow quick exit at the top of this function.
11786          */
11787         if (readSource == XLOG_FROM_STREAM)
11788         {
11789                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
11790                         readLen = XLOG_BLCKSZ;
11791                 else
11792                         readLen = XLogSegmentOffset(receivedUpto, wal_segment_size) -
11793                                 targetPageOff;
11794         }
11795         else
11796                 readLen = XLOG_BLCKSZ;
11797
11798         /* Read the requested page */
11799         readOff = targetPageOff;
11800         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
11801         {
11802                 char            fname[MAXFNAMELEN];
11803                 int                     save_errno = errno;
11804
11805                 XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
11806                 errno = save_errno;
11807                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11808                                 (errcode_for_file_access(),
11809                                  errmsg("could not seek in log segment %s to offset %u: %m",
11810                                                 fname, readOff)));
11811                 goto next_record_is_invalid;
11812         }
11813
11814         pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
11815         r = read(readFile, readBuf, XLOG_BLCKSZ);
11816         if (r != XLOG_BLCKSZ)
11817         {
11818                 char            fname[MAXFNAMELEN];
11819                 int                     save_errno = errno;
11820
11821                 pgstat_report_wait_end();
11822                 XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
11823                 if (r < 0)
11824                 {
11825                         errno = save_errno;
11826                         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11827                                         (errcode_for_file_access(),
11828                                          errmsg("could not read from log segment %s, offset %u: %m",
11829                                                         fname, readOff)));
11830                 }
11831                 else
11832                         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11833                                         (errcode(ERRCODE_DATA_CORRUPTED),
11834                                          errmsg("could not read from log segment %s, offset %u: read %d of %zu",
11835                                                         fname, readOff, r, (Size) XLOG_BLCKSZ)));
11836                 goto next_record_is_invalid;
11837         }
11838         pgstat_report_wait_end();
11839
11840         Assert(targetSegNo == readSegNo);
11841         Assert(targetPageOff == readOff);
11842         Assert(reqLen <= readLen);
11843
11844         *readTLI = curFileTLI;
11845
11846         /*
11847          * Check the page header immediately, so that we can retry immediately if
11848          * it's not valid. This may seem unnecessary, because XLogReadRecord()
11849          * validates the page header anyway, and would propagate the failure up to
11850          * ReadRecord(), which would retry. However, there's a corner case with
11851          * continuation records, if a record is split across two pages such that
11852          * we would need to read the two pages from different sources. For
11853          * example, imagine a scenario where a streaming replica is started up,
11854          * and replay reaches a record that's split across two WAL segments. The
11855          * first page is only available locally, in pg_wal, because it's already
11856          * been recycled in the master. The second page, however, is not present
11857          * in pg_wal, and we should stream it from the master. There is a recycled
11858          * WAL segment present in pg_wal, with garbage contents, however. We would
11859          * read the first page from the local WAL segment, but when reading the
11860          * second page, we would read the bogus, recycled, WAL segment. If we
11861          * didn't catch that case here, we would never recover, because
11862          * ReadRecord() would retry reading the whole record from the beginning.
11863          *
11864          * Of course, this only catches errors in the page header, which is what
11865          * happens in the case of a recycled WAL segment. Other kinds of errors or
11866          * corruption still has the same problem. But this at least fixes the
11867          * common case, which can happen as part of normal operation.
11868          *
11869          * Validating the page header is cheap enough that doing it twice
11870          * shouldn't be a big deal from a performance point of view.
11871          */
11872         if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
11873         {
11874                 /* reset any error XLogReaderValidatePageHeader() might have set */
11875                 xlogreader->errormsg_buf[0] = '\0';
11876                 goto next_record_is_invalid;
11877         }
11878
11879         return readLen;
11880
11881 next_record_is_invalid:
11882         lastSourceFailed = true;
11883
11884         if (readFile >= 0)
11885                 close(readFile);
11886         readFile = -1;
11887         readLen = 0;
11888         readSource = 0;
11889
11890         /* In standby-mode, keep trying */
11891         if (StandbyMode)
11892                 goto retry;
11893         else
11894                 return -1;
11895 }
11896
11897 /*
11898  * Open the WAL segment containing WAL location 'RecPtr'.
11899  *
11900  * The segment can be fetched via restore_command, or via walreceiver having
11901  * streamed the record, or it can already be present in pg_wal. Checking
11902  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
11903  * too, in case someone copies a new segment directly to pg_wal. That is not
11904  * documented or recommended, though.
11905  *
11906  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11907  * prepare to read WAL starting from RedoStartLSN after this.
11908  *
11909  * 'RecPtr' might not point to the beginning of the record we're interested
11910  * in, it might also point to the page or segment header. In that case,
11911  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11912  * used to decide which timeline to stream the requested WAL from.
11913  *
11914  * If the record is not immediately available, the function returns false
11915  * if we're not in standby mode. In standby mode, waits for it to become
11916  * available.
11917  *
11918  * When the requested record becomes available, the function opens the file
11919  * containing it (if not open already), and returns true. When end of standby
11920  * mode is triggered by the user, and there is no more WAL available, returns
11921  * false.
11922  */
11923 static bool
11924 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11925                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
11926 {
11927         static TimestampTz last_fail_time = 0;
11928         TimestampTz now;
11929         bool            streaming_reply_sent = false;
11930
11931         /*-------
11932          * Standby mode is implemented by a state machine:
11933          *
11934          * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
11935          *        pg_wal (XLOG_FROM_PG_WAL)
11936          * 2. Check trigger file
11937          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11938          * 4. Rescan timelines
11939          * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
11940          *
11941          * Failure to read from the current source advances the state machine to
11942          * the next state.
11943          *
11944          * 'currentSource' indicates the current state. There are no currentSource
11945          * values for "check trigger", "rescan timelines", and "sleep" states,
11946          * those actions are taken when reading from the previous source fails, as
11947          * part of advancing to the next state.
11948          *-------
11949          */
11950         if (!InArchiveRecovery)
11951                 currentSource = XLOG_FROM_PG_WAL;
11952         else if (currentSource == 0)
11953                 currentSource = XLOG_FROM_ARCHIVE;
11954
11955         for (;;)
11956         {
11957                 int                     oldSource = currentSource;
11958
11959                 /*
11960                  * First check if we failed to read from the current source, and
11961                  * advance the state machine if so. The failure to read might've
11962                  * happened outside this function, e.g when a CRC check fails on a
11963                  * record, or within this loop.
11964                  */
11965                 if (lastSourceFailed)
11966                 {
11967                         switch (currentSource)
11968                         {
11969                                 case XLOG_FROM_ARCHIVE:
11970                                 case XLOG_FROM_PG_WAL:
11971
11972                                         /*
11973                                          * Check to see if the trigger file exists. Note that we
11974                                          * do this only after failure, so when you create the
11975                                          * trigger file, we still finish replaying as much as we
11976                                          * can from archive and pg_wal before failover.
11977                                          */
11978                                         if (StandbyMode && CheckForStandbyTrigger())
11979                                         {
11980                                                 ShutdownWalRcv();
11981                                                 return false;
11982                                         }
11983
11984                                         /*
11985                                          * Not in standby mode, and we've now tried the archive
11986                                          * and pg_wal.
11987                                          */
11988                                         if (!StandbyMode)
11989                                                 return false;
11990
11991                                         /*
11992                                          * If primary_conninfo is set, launch walreceiver to try
11993                                          * to stream the missing WAL.
11994                                          *
11995                                          * If fetching_ckpt is true, RecPtr points to the initial
11996                                          * checkpoint location. In that case, we use RedoStartLSN
11997                                          * as the streaming start position instead of RecPtr, so
11998                                          * that when we later jump backwards to start redo at
11999                                          * RedoStartLSN, we will have the logs streamed already.
12000                                          */
12001                                         if (PrimaryConnInfo)
12002                                         {
12003                                                 XLogRecPtr      ptr;
12004                                                 TimeLineID      tli;
12005
12006                                                 if (fetching_ckpt)
12007                                                 {
12008                                                         ptr = RedoStartLSN;
12009                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
12010                                                 }
12011                                                 else
12012                                                 {
12013                                                         ptr = RecPtr;
12014
12015                                                         /*
12016                                                          * Use the record begin position to determine the
12017                                                          * TLI, rather than the position we're reading.
12018                                                          */
12019                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12020
12021                                                         if (curFileTLI > 0 && tli < curFileTLI)
12022                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12023                                                                          (uint32) (tliRecPtr >> 32),
12024                                                                          (uint32) tliRecPtr,
12025                                                                          tli, curFileTLI);
12026                                                 }
12027                                                 curFileTLI = tli;
12028                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12029                                                                                          PrimarySlotName);
12030                                                 receivedUpto = 0;
12031                                         }
12032
12033                                         /*
12034                                          * Move to XLOG_FROM_STREAM state in either case. We'll
12035                                          * get immediate failure if we didn't launch walreceiver,
12036                                          * and move on to the next state.
12037                                          */
12038                                         currentSource = XLOG_FROM_STREAM;
12039                                         break;
12040
12041                                 case XLOG_FROM_STREAM:
12042
12043                                         /*
12044                                          * Failure while streaming. Most likely, we got here
12045                                          * because streaming replication was terminated, or
12046                                          * promotion was triggered. But we also get here if we
12047                                          * find an invalid record in the WAL streamed from master,
12048                                          * in which case something is seriously wrong. There's
12049                                          * little chance that the problem will just go away, but
12050                                          * PANIC is not good for availability either, especially
12051                                          * in hot standby mode. So, we treat that the same as
12052                                          * disconnection, and retry from archive/pg_wal again. The
12053                                          * WAL in the archive should be identical to what was
12054                                          * streamed, so it's unlikely that it helps, but one can
12055                                          * hope...
12056                                          */
12057
12058                                         /*
12059                                          * Before we leave XLOG_FROM_STREAM state, make sure that
12060                                          * walreceiver is not active, so that it won't overwrite
12061                                          * WAL that we restore from archive.
12062                                          */
12063                                         if (WalRcvStreaming())
12064                                                 ShutdownWalRcv();
12065
12066                                         /*
12067                                          * Before we sleep, re-scan for possible new timelines if
12068                                          * we were requested to recover to the latest timeline.
12069                                          */
12070                                         if (recoveryTargetIsLatest)
12071                                         {
12072                                                 if (rescanLatestTimeLine())
12073                                                 {
12074                                                         currentSource = XLOG_FROM_ARCHIVE;
12075                                                         break;
12076                                                 }
12077                                         }
12078
12079                                         /*
12080                                          * XLOG_FROM_STREAM is the last state in our state
12081                                          * machine, so we've exhausted all the options for
12082                                          * obtaining the requested WAL. We're going to loop back
12083                                          * and retry from the archive, but if it hasn't been long
12084                                          * since last attempt, sleep wal_retrieve_retry_interval
12085                                          * milliseconds to avoid busy-waiting.
12086                                          */
12087                                         now = GetCurrentTimestamp();
12088                                         if (!TimestampDifferenceExceeds(last_fail_time, now,
12089                                                                                                         wal_retrieve_retry_interval))
12090                                         {
12091                                                 long            secs,
12092                                                                         wait_time;
12093                                                 int                     usecs;
12094
12095                                                 TimestampDifference(last_fail_time, now, &secs, &usecs);
12096                                                 wait_time = wal_retrieve_retry_interval -
12097                                                         (secs * 1000 + usecs / 1000);
12098
12099                                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
12100                                                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
12101                                                                   wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
12102                                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12103                                                 now = GetCurrentTimestamp();
12104                                         }
12105                                         last_fail_time = now;
12106                                         currentSource = XLOG_FROM_ARCHIVE;
12107                                         break;
12108
12109                                 default:
12110                                         elog(ERROR, "unexpected WAL source %d", currentSource);
12111                         }
12112                 }
12113                 else if (currentSource == XLOG_FROM_PG_WAL)
12114                 {
12115                         /*
12116                          * We just successfully read a file in pg_wal. We prefer files in
12117                          * the archive over ones in pg_wal, so try the next file again
12118                          * from the archive first.
12119                          */
12120                         if (InArchiveRecovery)
12121                                 currentSource = XLOG_FROM_ARCHIVE;
12122                 }
12123
12124                 if (currentSource != oldSource)
12125                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
12126                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12127                                  lastSourceFailed ? "failure" : "success");
12128
12129                 /*
12130                  * We've now handled possible failure. Try to read from the chosen
12131                  * source.
12132                  */
12133                 lastSourceFailed = false;
12134
12135                 switch (currentSource)
12136                 {
12137                         case XLOG_FROM_ARCHIVE:
12138                         case XLOG_FROM_PG_WAL:
12139                                 /* Close any old file we might have open. */
12140                                 if (readFile >= 0)
12141                                 {
12142                                         close(readFile);
12143                                         readFile = -1;
12144                                 }
12145                                 /* Reset curFileTLI if random fetch. */
12146                                 if (randAccess)
12147                                         curFileTLI = 0;
12148
12149                                 /*
12150                                  * Try to restore the file from archive, or read an existing
12151                                  * file from pg_wal.
12152                                  */
12153                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12154                                                                                           currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12155                                                                                           currentSource);
12156                                 if (readFile >= 0)
12157                                         return true;    /* success! */
12158
12159                                 /*
12160                                  * Nope, not found in archive or pg_wal.
12161                                  */
12162                                 lastSourceFailed = true;
12163                                 break;
12164
12165                         case XLOG_FROM_STREAM:
12166                                 {
12167                                         bool            havedata;
12168
12169                                         /*
12170                                          * Check if WAL receiver is still active.
12171                                          */
12172                                         if (!WalRcvStreaming())
12173                                         {
12174                                                 lastSourceFailed = true;
12175                                                 break;
12176                                         }
12177
12178                                         /*
12179                                          * Walreceiver is active, so see if new data has arrived.
12180                                          *
12181                                          * We only advance XLogReceiptTime when we obtain fresh
12182                                          * WAL from walreceiver and observe that we had already
12183                                          * processed everything before the most recent "chunk"
12184                                          * that it flushed to disk.  In steady state where we are
12185                                          * keeping up with the incoming data, XLogReceiptTime will
12186                                          * be updated on each cycle. When we are behind,
12187                                          * XLogReceiptTime will not advance, so the grace time
12188                                          * allotted to conflicting queries will decrease.
12189                                          */
12190                                         if (RecPtr < receivedUpto)
12191                                                 havedata = true;
12192                                         else
12193                                         {
12194                                                 XLogRecPtr      latestChunkStart;
12195
12196                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
12197                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
12198                                                 {
12199                                                         havedata = true;
12200                                                         if (latestChunkStart <= RecPtr)
12201                                                         {
12202                                                                 XLogReceiptTime = GetCurrentTimestamp();
12203                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
12204                                                         }
12205                                                 }
12206                                                 else
12207                                                         havedata = false;
12208                                         }
12209                                         if (havedata)
12210                                         {
12211                                                 /*
12212                                                  * Great, streamed far enough.  Open the file if it's
12213                                                  * not open already.  Also read the timeline history
12214                                                  * file if we haven't initialized timeline history
12215                                                  * yet; it should be streamed over and present in
12216                                                  * pg_wal by now.  Use XLOG_FROM_STREAM so that source
12217                                                  * info is set correctly and XLogReceiptTime isn't
12218                                                  * changed.
12219                                                  */
12220                                                 if (readFile < 0)
12221                                                 {
12222                                                         if (!expectedTLEs)
12223                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
12224                                                         readFile = XLogFileRead(readSegNo, PANIC,
12225                                                                                                         receiveTLI,
12226                                                                                                         XLOG_FROM_STREAM, false);
12227                                                         Assert(readFile >= 0);
12228                                                 }
12229                                                 else
12230                                                 {
12231                                                         /* just make sure source info is correct... */
12232                                                         readSource = XLOG_FROM_STREAM;
12233                                                         XLogReceiptSource = XLOG_FROM_STREAM;
12234                                                         return true;
12235                                                 }
12236                                                 break;
12237                                         }
12238
12239                                         /*
12240                                          * Data not here yet. Check for trigger, then wait for
12241                                          * walreceiver to wake us up when new WAL arrives.
12242                                          */
12243                                         if (CheckForStandbyTrigger())
12244                                         {
12245                                                 /*
12246                                                  * Note that we don't "return false" immediately here.
12247                                                  * After being triggered, we still want to replay all
12248                                                  * the WAL that was already streamed. It's in pg_wal
12249                                                  * now, so we just treat this as a failure, and the
12250                                                  * state machine will move on to replay the streamed
12251                                                  * WAL from pg_wal, and then recheck the trigger and
12252                                                  * exit replay.
12253                                                  */
12254                                                 lastSourceFailed = true;
12255                                                 break;
12256                                         }
12257
12258                                         /*
12259                                          * Since we have replayed everything we have received so
12260                                          * far and are about to start waiting for more WAL, let's
12261                                          * tell the upstream server our replay location now so
12262                                          * that pg_stat_replication doesn't show stale
12263                                          * information.
12264                                          */
12265                                         if (!streaming_reply_sent)
12266                                         {
12267                                                 WalRcvForceReply();
12268                                                 streaming_reply_sent = true;
12269                                         }
12270
12271                                         /*
12272                                          * Wait for more WAL to arrive. Time out after 5 seconds
12273                                          * to react to a trigger file promptly.
12274                                          */
12275                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
12276                                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
12277                                                           5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
12278                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
12279                                         break;
12280                                 }
12281
12282                         default:
12283                                 elog(ERROR, "unexpected WAL source %d", currentSource);
12284                 }
12285
12286                 /*
12287                  * This possibly-long loop needs to handle interrupts of startup
12288                  * process.
12289                  */
12290                 HandleStartupProcInterrupts();
12291         }
12292
12293         return false;                           /* not reached */
12294 }
12295
12296 /*
12297  * Determine what log level should be used to report a corrupt WAL record
12298  * in the current WAL page, previously read by XLogPageRead().
12299  *
12300  * 'emode' is the error mode that would be used to report a file-not-found
12301  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
12302  * we're retrying the exact same record that we've tried previously, only
12303  * complain the first time to keep the noise down.  However, we only do when
12304  * reading from pg_wal, because we don't expect any invalid records in archive
12305  * or in records streamed from master. Files in the archive should be complete,
12306  * and we should never hit the end of WAL because we stop and wait for more WAL
12307  * to arrive before replaying it.
12308  *
12309  * NOTE: This function remembers the RecPtr value it was last called with,
12310  * to suppress repeated messages about the same record. Only call this when
12311  * you are about to ereport(), or you might cause a later message to be
12312  * erroneously suppressed.
12313  */
12314 static int
12315 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12316 {
12317         static XLogRecPtr lastComplaint = 0;
12318
12319         if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12320         {
12321                 if (RecPtr == lastComplaint)
12322                         emode = DEBUG1;
12323                 else
12324                         lastComplaint = RecPtr;
12325         }
12326         return emode;
12327 }
12328
12329 /*
12330  * Check to see whether the user-specified trigger file exists and whether a
12331  * promote request has arrived.  If either condition holds, return true.
12332  */
12333 static bool
12334 CheckForStandbyTrigger(void)
12335 {
12336         struct stat stat_buf;
12337         static bool triggered = false;
12338
12339         if (triggered)
12340                 return true;
12341
12342         if (IsPromoteTriggered())
12343         {
12344                 /*
12345                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12346                  * signal handler. It now leaves the file in place and lets the
12347                  * Startup process do the unlink. This allows Startup to know whether
12348                  * it should create a full checkpoint before starting up (fallback
12349                  * mode). Fast promotion takes precedence.
12350                  */
12351                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12352                 {
12353                         unlink(PROMOTE_SIGNAL_FILE);
12354                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12355                         fast_promote = true;
12356                 }
12357                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12358                 {
12359                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12360                         fast_promote = false;
12361                 }
12362
12363                 ereport(LOG, (errmsg("received promote request")));
12364
12365                 ResetPromoteTriggered();
12366                 triggered = true;
12367                 return true;
12368         }
12369
12370         if (TriggerFile == NULL)
12371                 return false;
12372
12373         if (stat(TriggerFile, &stat_buf) == 0)
12374         {
12375                 ereport(LOG,
12376                                 (errmsg("trigger file found: %s", TriggerFile)));
12377                 unlink(TriggerFile);
12378                 triggered = true;
12379                 fast_promote = true;
12380                 return true;
12381         }
12382         else if (errno != ENOENT)
12383                 ereport(ERROR,
12384                                 (errcode_for_file_access(),
12385                                  errmsg("could not stat trigger file \"%s\": %m",
12386                                                 TriggerFile)));
12387
12388         return false;
12389 }
12390
12391 /*
12392  * Remove the files signaling a standby promotion request.
12393  */
12394 void
12395 RemovePromoteSignalFiles(void)
12396 {
12397         unlink(PROMOTE_SIGNAL_FILE);
12398         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12399 }
12400
12401 /*
12402  * Check to see if a promote request has arrived. Should be
12403  * called by postmaster after receiving SIGUSR1.
12404  */
12405 bool
12406 CheckPromoteSignal(void)
12407 {
12408         struct stat stat_buf;
12409
12410         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12411                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12412                 return true;
12413
12414         return false;
12415 }
12416
12417 /*
12418  * Wake up startup process to replay newly arrived WAL, or to notice that
12419  * failover has been requested.
12420  */
12421 void
12422 WakeupRecovery(void)
12423 {
12424         SetLatch(&XLogCtl->recoveryWakeupLatch);
12425 }
12426
12427 /*
12428  * Update the WalWriterSleeping flag.
12429  */
12430 void
12431 SetWalWriterSleeping(bool sleeping)
12432 {
12433         SpinLockAcquire(&XLogCtl->info_lck);
12434         XLogCtl->WalWriterSleeping = sleeping;
12435         SpinLockRelease(&XLogCtl->info_lck);
12436 }
12437
12438 /*
12439  * Schedule a walreceiver wakeup in the main recovery loop.
12440  */
12441 void
12442 XLogRequestWalReceiverReply(void)
12443 {
12444         doRequestWalReceiverReply = true;
12445 }