granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/subtrans.h"
  27 #include "access/timeline.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogutils.h"
  34 #include "catalog/catversion.h"
  35 #include "catalog/pg_control.h"
  36 #include "catalog/pg_database.h"
  37 #include "libpq/pqsignal.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "postmaster/startup.h"
  42 #include "replication/walreceiver.h"
  43 #include "replication/walsender.h"
  44 #include "storage/bufmgr.h"
  45 #include "storage/fd.h"
  46 #include "storage/ipc.h"
  47 #include "storage/latch.h"
  48 #include "storage/pmsignal.h"
  49 #include "storage/predicate.h"
  50 #include "storage/proc.h"
  51 #include "storage/procarray.h"
  52 #include "storage/reinit.h"
  53 #include "storage/smgr.h"
  54 #include "storage/spin.h"
  55 #include "utils/builtins.h"
  56 #include "utils/guc.h"
  57 #include "utils/ps_status.h"
  58 #include "utils/relmapper.h"
  59 #include "utils/snapmgr.h"
  60 #include "utils/timestamp.h"
  61 #include "pg_trace.h"
  62
  63
  64 /* File path names (all relative to $PGDATA) */
  65 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  66 #define RECOVERY_COMMAND_DONE   "recovery.done"
  67 #define PROMOTE_SIGNAL_FILE "promote"
  68
  69
  70 /* User-settable parameters */
  71 int                     CheckPointSegments = 3;
  72 int                     wal_keep_segments = 0;
  73 int                     XLOGbuffers = -1;
  74 int                     XLogArchiveTimeout = 0;
  75 bool            XLogArchiveMode = false;
  76 char       *XLogArchiveCommand = NULL;
  77 bool            EnableHotStandby = false;
  78 bool            fullPageWrites = true;
  79 bool            log_checkpoints = false;
  80 int                     sync_method = DEFAULT_SYNC_METHOD;
  81 int                     wal_level = WAL_LEVEL_MINIMAL;
  82 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  83 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  84
  85 #ifdef WAL_DEBUG
  86 bool            XLOG_DEBUG = false;
  87 #endif
  88
  89 /*
  90  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  91  * When we are done with an old XLOG segment file, we will recycle it as a
  92  * future XLOG segment as long as there aren't already XLOGfileslop future
  93  * segments; else we'll delete it.  This could be made a separate GUC
  94  * variable, but at present I think it's sufficient to hardwire it as
  95  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
  96  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  97  * of them; the +1 allows boundary cases to happen without wasting a
  98  * delete/create-segment cycle.
  99  */
 100 #define XLOGfileslop    (2*CheckPointSegments + 1)
 101
 102
 103 /*
 104  * GUC support
      *
      * Table of the values selectable for sync_method.  "fsync" is always
      * present; every other entry is compiled in only when the corresponding
      * platform macro (HAVE_FSYNC_WRITETHROUGH, HAVE_FDATASYNC, OPEN_SYNC_FLAG,
      * OPEN_DATASYNC_FLAG) is defined.
 105  */
 106 const struct config_enum_entry sync_method_options[] = {
 107         {"fsync", SYNC_METHOD_FSYNC, false},
 108 #ifdef HAVE_FSYNC_WRITETHROUGH
 109         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 110 #endif
 111 #ifdef HAVE_FDATASYNC
 112         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 113 #endif
 114 #ifdef OPEN_SYNC_FLAG
 115         {"open_sync", SYNC_METHOD_OPEN, false},
 116 #endif
 117 #ifdef OPEN_DATASYNC_FLAG
 118         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 119 #endif
 120         {NULL, 0, false}                /* NULL name: presumably the end-of-list marker */
 121 };
 122
 123 /*
 124  * Statistics for current checkpoint are collected in this global struct.
 125  * Because only the checkpointer or a stand-alone backend can perform
 126  * checkpoints, this will be unused in normal backends.
 127  */
 128 CheckpointStatsData CheckpointStats;
 129
 130 /*
 131  * ThisTimeLineID will be same in all backends --- it identifies current
 132  * WAL timeline for the database system.
 133  */
 134 TimeLineID      ThisTimeLineID = 0;
 135
 136 /*
 137  * Are we doing recovery from XLOG?
 138  *
 139  * This is only ever true in the startup process; it should be read as meaning
 140  * "this process is replaying WAL records", rather than "the system is in
 141  * recovery mode".  It should be examined primarily by functions that need
 142  * to act differently when called from a WAL redo function (e.g., to skip WAL
 143  * logging).  To check whether the system is in recovery regardless of which
 144  * process you're running in, use RecoveryInProgress() but only after shared
 145  * memory startup and lock initialization.
 146  */
 147 bool            InRecovery = false;
 148
 149 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 150 HotStandbyState standbyState = STANDBY_DISABLED;
 151
 152 static XLogRecPtr LastRec;
 153
 154 /* Local copy of WalRcv->receivedUpto */
 155 static XLogRecPtr receivedUpto = 0;
 156
 157 /*
 158  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 159  * the replayed WAL records indicate. It's initialized with full_page_writes
 160  * that the recovery starting checkpoint record indicates, and then updated
 161  * each time XLOG_FPW_CHANGE record is replayed.
 162  */
 163 static bool lastFullPageWrites;
 164
 165 /*
 166  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 167  * known, need to check the shared state".
 168  */
 169 static bool LocalRecoveryInProgress = true;
 170
 171 /*
 172  * Local copy of SharedHotStandbyActive variable. False actually means "not
 173  * known, need to check the shared state".
 174  */
 175 static bool LocalHotStandbyActive = false;
 176
 177 /*
 178  * Local state for XLogInsertAllowed():
 179  *              1: unconditionally allowed to insert XLOG
 180  *              0: unconditionally not allowed to insert XLOG
 181  *              -1: must check RecoveryInProgress(); disallow until it is false
 182  * Most processes start with -1 and transition to 1 after seeing that recovery
 183  * is not in progress.  But we can also force the value for special cases.
 184  * The coding in XLogInsertAllowed() depends on the first two of these states
 185  * being numerically the same as bool true and false.
 186  */
 187 static int      LocalXLogInsertAllowed = -1;
 188
 189 /* Are we recovering using offline XLOG archives? (only valid in the startup process) */
 190 bool InArchiveRecovery = false;
 191
 192 /* Was the last xlog file restored from archive, or local? */
 193 static bool restoredFromArchive = false;
 194
 195 /* options taken from recovery.conf for archive recovery */
 196 char *recoveryRestoreCommand = NULL;
 197 static char *recoveryEndCommand = NULL;
 198 static char *archiveCleanupCommand = NULL;
 199 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 200 static bool recoveryTargetInclusive = true;
 201 static bool recoveryPauseAtTarget = true;
 202 static TransactionId recoveryTargetXid;
 203 static TimestampTz recoveryTargetTime;
 204 static char *recoveryTargetName;
 205
 206 /* options taken from recovery.conf for XLOG streaming */
 207 bool StandbyMode = false;
 208 static char *PrimaryConnInfo = NULL;
 209 static char *TriggerFile = NULL;
 210
 211 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 212 static TransactionId recoveryStopXid;
 213 static TimestampTz recoveryStopTime;
 214 static char recoveryStopName[MAXFNAMELEN];
 215 static bool recoveryStopAfter;
 216
 217 /*
 218  * During normal operation, the only timeline we care about is ThisTimeLineID.
 219  * During recovery, however, things are more complicated.  To simplify life
 220  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 221  * scan through the WAL history (that is, it is the line that was active when
 222  * the currently-scanned WAL record was generated).  We also need these
 223  * timeline values:
 224  *
 225  * recoveryTargetTLI: the desired timeline that we want to end in.
 226  *
 227  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 228  *
 229  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 230  * its known parents, newest first (so recoveryTargetTLI is always the
 231  * first list member).  Only these TLIs are expected to be seen in the WAL
 232  * segments we read, and indeed only these TLIs will be considered as
 233  * candidate WAL files to open at all.
 234  *
 235  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 236  * (This is not necessarily the same as ThisTimeLineID, because we could
 237  * be scanning data that was copied from an ancestor timeline when the current
 238  * file was created.)  During a sequential scan we do not allow this value
 239  * to decrease.
 240  */
 241 static TimeLineID recoveryTargetTLI;
 242 static bool recoveryTargetIsLatest = false;
 243 static List *expectedTLEs;
 244 static TimeLineID curFileTLI;
 245
 246 /*
 247  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 248  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 249  * end+1 of the last record, and is reset when we end a top-level transaction,
 250  * or start a new one; so it can be used to tell if the current transaction has
 251  * created any XLOG records.
 252  */
 253 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 254
 255 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 256
 257 /*
 258  * RedoRecPtr is this backend's local copy of the REDO record pointer
 259  * (which is almost but not quite the same as a pointer to the most recent
 260  * CHECKPOINT record).  We update this from the shared-memory copy,
 261  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 262  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 263  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 264  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 265  * InitXLOGAccess.
 266  */
 267 static XLogRecPtr RedoRecPtr;
 268
 269 /*
 270  * RedoStartLSN points to the checkpoint's REDO location which is specified
 271  * in a backup label file, backup history file or control file. In standby
 272  * mode, XLOG streaming usually starts from the position where an invalid
 273  * record was found. But if we fail to read even the initial checkpoint
 274  * record, we use the REDO location instead of the checkpoint location as
 275  * the start position of XLOG streaming. Otherwise we would have to jump
 276  * backwards to the REDO location after reading the checkpoint record,
 277  * because the REDO record can precede the checkpoint record.
 278  */
 279 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 280
 281 /*----------
 282  * Shared-memory data structures for XLOG control
 283  *
 284  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 285  * the log up to (all records before that point must be written or fsynced).
 286  * LogwrtResult indicates the byte positions we have already written/fsynced.
 287  * These structs are identical but are declared separately to indicate their
 288  * slightly different functions.
 289  *
 290  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 291  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 292  * this arrangement is that the value can be examined by code that already
 293  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 294  * to the shared variable, each backend has a private copy of LogwrtResult,
 295  * which is updated when convenient.
 296  *
 297  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 298  * (protected by info_lck), but we don't need to cache any copies of it.
 299  *
 300  * info_lck is only held long enough to read/update the protected variables,
 301  * so it's a plain spinlock.  The other locks are held longer (potentially
 302  * over I/O operations), so we use LWLocks for them.  These locks are:
 303  *
 304  * WALInsertLock: must be held to insert a record into the WAL buffers.
 305  *
 306  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 307  * XLogFlush).
 308  *
 309  * ControlFileLock: must be held to read/update control file or create
 310  * new log file.
 311  *
 312  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 313  * only one checkpointer at a time; currently, with all checkpoints done by
 314  * the checkpointer, this is just pro forma).
 315  *
 316  *----------
 317  */
 318
 319 typedef struct XLogwrtRqst              /* positions we are asked to write/flush the log up to */
 320 {
 321         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 322         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 323 } XLogwrtRqst;
 324
 325 typedef struct XLogwrtResult            /* positions already written/fsynced; same layout as XLogwrtRqst */
 326 {
 327         XLogRecPtr      Write;                  /* last byte + 1 written out */
 328         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 329 } XLogwrtResult;
 330
 331 /*
 332  * Shared state data for XLogInsert.
      *
      * The instance embedded in XLogCtlData (XLogCtl->Insert) is marked there as
      * protected by WALInsertLock.
 333  */
 334 typedef struct XLogCtlInsert
 335 {
 336         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
 337         int                     curridx;                /* current block index in cache */
 338         XLogPageHeader currpage;        /* points to header of block in cache */
 339         char       *currpos;            /* current insertion point in cache */
 340         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 341         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 342
 343         /*
 344          * fullPageWrites is the master copy used by all backends to determine
 345          * whether to write full-page to WAL, instead of using process-local one.
 346          * This is required because, when full_page_writes is changed by SIGHUP,
 347          * we must WAL-log it before it actually affects WAL-logging by backends.
 348          * The checkpointer sets it at startup or after SIGHUP.
 349          */
 350         bool            fullPageWrites;
 351
 352         /*
 353          * exclusiveBackup is true if a backup started with pg_start_backup() is
 354          * in progress, and nonExclusiveBackups is a counter indicating the number
 355          * of streaming base backups currently in progress. forcePageWrites is set
 356          * to true when either of these is non-zero. lastBackupStart is the latest
 357          * checkpoint redo location used as a starting point for an online backup.
 358          */
 359         bool            exclusiveBackup;
 360         int                     nonExclusiveBackups;
 361         XLogRecPtr      lastBackupStart;
 362 } XLogCtlInsert;
 363
 364 /*
 365  * Shared state data for XLogWrite/XLogFlush.
      *
      * The instance embedded in XLogCtlData (XLogCtl->Write) is marked there as
      * protected by WALWriteLock.
 366  */
 367 typedef struct XLogCtlWrite
 368 {
 369         int                     curridx;                /* cache index of next block to write */
 370         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
 371 } XLogCtlWrite;
 372
 373 /*
 374  * Total shared-memory state for XLOG.
 375  */
 376 typedef struct XLogCtlData
 377 {
 378         /* Protected by WALInsertLock: */
 379         XLogCtlInsert Insert;
 380
 381         /* Protected by info_lck: */
 382         XLogwrtRqst LogwrtRqst;
 383         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
 384         TransactionId ckptXid;
 385         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
 386         XLogSegNo       lastRemovedSegNo; /* latest removed/recycled XLOG segment */
 387
 388         /* Protected by WALWriteLock: */
 389         XLogCtlWrite Write;
 390
 391         /*
 392          * Protected by info_lck and WALWriteLock (you must hold either lock to
 393          * read it, but both to update)
 394          */
 395         XLogwrtResult LogwrtResult;
 396
 397         /*
 398          * These values do not change after startup, although the pointed-to pages
 399          * and xlblocks values certainly do.  Permission to read/write the pages
 400          * and xlblocks values depends on WALInsertLock and WALWriteLock.
 401          */
 402         char       *pages;                      /* buffers for unwritten XLOG pages */
 403         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
 404         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 405         TimeLineID      ThisTimeLineID;
 406
 407         /*
 408          * archiveCleanupCommand is read from recovery.conf but needs to be in
 409          * shared memory so that the checkpointer process can access it.
 410          */
 411         char            archiveCleanupCommand[MAXPGPATH];
 412
 413         /*
 414          * SharedRecoveryInProgress indicates if we're still in crash or archive
 415          * recovery.  Protected by info_lck.
 416          */
 417         bool            SharedRecoveryInProgress;
 418
 419         /*
 420          * SharedHotStandbyActive indicates if hot standby is active, i.e. if
 421          * hot standby queries are allowed to run.  Protected by info_lck.
 422          */
 423         bool            SharedHotStandbyActive;
 424
 425         /*
 426          * WalWriterSleeping indicates whether the WAL writer is currently in
 427          * low-power mode (and hence should be nudged if an async commit occurs).
 428          * Protected by info_lck.
 429          */
 430         bool            WalWriterSleeping;
 431
 432         /*
 433          * recoveryWakeupLatch is used to wake up the startup process to continue
 434          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 435          * to appear.
 436          */
 437         Latch           recoveryWakeupLatch;
 438
 439         /*
 440          * During recovery, we keep a copy of the latest checkpoint record here.
 441          * Used by the checkpointer when it wants to create a restartpoint.
 442          *
 443          * Protected by info_lck.
 444          */
 445         XLogRecPtr      lastCheckPointRecPtr;
 446         CheckPoint      lastCheckPoint;
 447
 448         /* end+1 of the last record replayed (or being replayed) */
 449         XLogRecPtr      replayEndRecPtr;
 450         TimeLineID      replayEndTLI;
 451         /* end+1 of the last record replayed */
 452         XLogRecPtr      recoveryLastRecPtr;
 453         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 454         TimestampTz recoveryLastXTime;
 455         /* current effective recovery target timeline */
 456         TimeLineID      RecoveryTargetTLI;
 457
 458         /*
 459          * timestamp of when we started replaying the current chunk of WAL data,
 460          * only relevant for replication or archive recovery
 461          */
 462         TimestampTz currentChunkStartTime;
 463         /* Are we requested to pause recovery? */
 464         bool            recoveryPause;
 465
 466         /*
 467          * lastFpwDisableRecPtr points to the start of the last replayed
 468          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
 469          */
 470         XLogRecPtr      lastFpwDisableRecPtr;
 471
 472         slock_t         info_lck;               /* locks shared variables shown above */
 473 } XLogCtlData;
 474
 475 static XLogCtlData *XLogCtl = NULL;
 476
 477 /*
 478  * We maintain an image of pg_control in shared memory.
 479  */
 480 static ControlFileData *ControlFile = NULL;
 481
 482 /*
 483  * Macros for managing XLogInsert state.  In most cases, the calling routine
 484  * has a local pointer to XLogCtl->Insert and/or a local copy of
 485  * XLogCtl->Insert.curridx, so these are passed as parameters instead of
      * being fetched via XLogCtl.
 486  */
 487
 488 /*
      * Free space remaining in the current xlog page buffer: XLOG_BLCKSZ minus
      * the distance from the start of currpage to currpos.
      */
 489 #define INSERT_FREESPACE(Insert)  \
 490         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 491
 492 /*
      * Construct XLogRecPtr value for current insertion point: the end-of-page
      * pointer stored in xlblocks[curridx] minus the free space left on the page.
      * Expands to a bare (unparenthesized) assignment to recptr, so use it only
      * as a statement.
      */
 493 #define INSERT_RECPTR(recptr,Insert,curridx)  \
 494                 (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)
 495
     /*
      * Previous/next WAL buffer index, wrapping around between 0 and
      * XLogCtl->XLogCacheBlck (the highest allocated buffer index).  Both
      * macros evaluate idx more than once.
      */
 496 #define PrevBufIdx(idx)         \
 497                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
 498
 499 #define NextBufIdx(idx)         \
 500                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 501
 502 /*
 503  * Private, possibly out-of-date copy of shared LogwrtResult.
 504  * See discussion above.
 505  */
 506 static XLogwrtResult LogwrtResult = {0, 0};
 507
 508 /*
 509  * Codes indicating where we got a WAL file from during recovery, or where
 510  * to attempt to get one.
 511  */
 512 typedef enum
 513 {
 514         XLOG_FROM_ANY = 0,              /* request to read WAL from any source */
 515         XLOG_FROM_ARCHIVE,              /* restored using restore_command */
 516         XLOG_FROM_PG_XLOG,              /* existing file in pg_xlog */
 517         XLOG_FROM_STREAM,               /* streamed from master */
 518 } XLogSource;
 519
 520 /*
      * Human-readable names for XLogSources, for debugging output.  Listed in
      * the same order as the XLogSource enumerators, presumably so the array can
      * be indexed by an XLogSource value; keep the two in sync.
      */
 521 static const char *xlogSourceNames[] = { "any", "archive", "pg_xlog", "stream" };
 522
 523 /*
 524  * openLogFile is -1 or a kernel FD for an open log file segment.
 525  * When it's open, openLogOff is the current seek offset in the file.
 526  * openLogSegNo identifies the segment.  These variables are only
 527  * used to write the XLOG, and so will normally refer to the active segment.
 528  */
 529 static int      openLogFile = -1;
 530 static XLogSegNo openLogSegNo = 0;
 531 static uint32 openLogOff = 0;
 532
 533 /*
 534  * These variables are used similarly to the ones above, but for reading
 535  * the XLOG.  Note, however, that readOff generally represents the offset
 536  * of the page just read, not the seek position of the FD itself, which
 537  * will be just past that page. readLen indicates how much of the current
 538  * page has been read into readBuf, and readSource indicates where we got
 539  * the currently open file from.
 540  */
 541 static int      readFile = -1;
 542 static XLogSegNo readSegNo = 0;
 543 static uint32 readOff = 0;
 544 static uint32 readLen = 0;
 545 static bool     readFileHeaderValidated = false;
 546 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
 547
 548 /*
 549  * Keeps track of which source we're currently reading from. This is
 550  * different from readSource in that this is always set, even when we don't
 551  * currently have a WAL file open. If lastSourceFailed is set, our last
 552  * attempt to read from currentSource failed, and we should try another source
 553  * next.
 554  */
 555 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 556 static bool     lastSourceFailed = false;
 557
 558 /*
 559  * These variables track when we last obtained some WAL data to process,
 560  * and where we got it from.  (XLogReceiptSource is initially the same as
 561  * readSource, but readSource gets reset to zero when we don't have data
 562  * to process right now.  It is also different from currentSource, which
 563  * also changes when we try to read from a source and fail, while
 564  * XLogReceiptSource tracks where we last successfully read some WAL.)
 565  */
 566 static TimestampTz XLogReceiptTime = 0;
 567 static XLogSource XLogReceiptSource = 0;        /* XLOG_FROM_* code */
 568
 569 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 570 static char *readBuf = NULL;
 571
 572 /* Buffer for current ReadRecord result (expandable) */
 573 static char *readRecordBuf = NULL;
 574 static uint32 readRecordBufSize = 0;
 575
 576 /* State information for XLOG reading */
 577 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 578 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 579 static TimeLineID lastPageTLI = 0;
 580 static TimeLineID lastSegmentTLI = 0;
 581
 582 static XLogRecPtr minRecoveryPoint;             /* local copy of
 583                                                                                  * ControlFile->minRecoveryPoint */
 584 static TimeLineID minRecoveryPointTLI;
 585 static bool updateMinRecoveryPoint = true;
 586
 587 /*
 588  * Have we reached a consistent database state? In crash recovery, we have
 589  * to replay all the WAL, so reachedConsistency is never set. During archive
 590  * recovery, the database is consistent once minRecoveryPoint is reached.
 591  */
 592 bool            reachedConsistency = false;
 593
 594 static bool InRedo = false;
 595
 596 /* Have we launched bgwriter during recovery? */
 597 static bool bgwriterLaunched = false;
 598
 599
 600 static void readRecoveryCommandFile(void);
 601 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 602 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 603 static void recoveryPausesHere(void);
 604 static void SetLatestXTime(TimestampTz xtime);
 605 static void SetCurrentChunkStartTime(TimestampTz xtime);
 606 static void CheckRequiredParameterValues(void);
 607 static void XLogReportParameters(void);
 608 static void LocalSetXLogInsertAllowed(void);
 609 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 610 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 611
 612 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 613                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 614 static bool AdvanceXLInsertBuffer(bool new_segment);
 615 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 616 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 617 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 618                                            bool find_free, int *max_advance,
 619                                            bool use_lock);
 620 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 621                          int source, bool notexistOk);
 622 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 623 static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 624                          bool randAccess);
 625 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 626                                                         bool fetching_ckpt);
 627 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 628 static void XLogFileClose(void);
 629 static void PreallocXlogFiles(XLogRecPtr endptr);
 630 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 631 static void UpdateLastRemovedPtr(char *filename);
 632 static void ValidateXLOGDirectoryStructure(void);
 633 static void CleanupBackupHistory(void);
 634 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 635 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
 636 static void CheckRecoveryConsistency(void);
 637 static bool ValidXLogPageHeader(XLogPageHeader hdr, int emode, bool segmentonly);
 638 static bool ValidXLogRecordHeader(XLogRecPtr *RecPtr, XLogRecord *record,
 639                                           int emode, bool randAccess);
 640 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 641 static bool rescanLatestTimeLine(void);
 642 static void WriteControlFile(void);
 643 static void ReadControlFile(void);
 644 static char *str_time(pg_time_t tnow);
 645 static bool CheckForStandbyTrigger(void);
 646
 647 #ifdef WAL_DEBUG
 648 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 649 #endif
 650 static void pg_start_backup_callback(int code, Datum arg);
 651 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 652                                   bool *backupEndRequired, bool *backupFromStandby);
 653 static void rm_redo_error_callback(void *arg);
 654 static int      get_sync_bit(int method);
 655
 656
 657 /*
 658  * Insert an XLOG record having the specified RMID and info bytes,
 659  * with the body of the record being the data chunk(s) described by
 660  * the rdata chain (see xlog.h for notes about rdata).
 661  *
 662  * Returns XLOG pointer to end of record (beginning of next record).
 663  * This can be used as LSN for data pages affected by the logged action.
 664  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 665  * before the data page can be written out.  This implements the basic
 666  * WAL rule "write the log before the data".)
 667  *
 668  * NB: this routine feels free to scribble on the XLogRecData structs,
 669  * though not on the data they reference.  This is OK since the XLogRecData
 670  * structs are always just temporaries in the calling code.
 671  */
 672 XLogRecPtr
 673 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 674 {
             /*
              * Overview of the steps below:
              *   1. Walk the rdata chain, deciding per referenced buffer whether a
              *      full-page backup block is needed, and total up the data length.
              *   2. Append chain entries for the backup blocks and CRC the data.
              *   3. Take WALInsertLock, recheck the decisions made without the lock
              *      (looping back to "begin" if they are stale), then copy the
              *      record into the WAL buffers, advancing pages as needed.
              *   4. Handle XLOG_SWITCH specially, publish the new write request if
              *      a page was filled, and return the record's end position.
              */
 675         XLogCtlInsert *Insert = &XLogCtl->Insert;
 676         XLogRecPtr      RecPtr;
 677         XLogRecPtr      WriteRqst;
 678         uint32          freespace;
 679         int                     curridx;
 680         XLogRecData *rdt;
 681         XLogRecData *rdt_lastnormal;
 682         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 683         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 684         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 685         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 686         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 687         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 688         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 689         XLogRecData hdr_rdt;
 690         pg_crc32        rdata_crc;
 691         uint32          len,
 692                                 write_len;
 693         unsigned        i;
 694         bool            updrqst;
 695         bool            doPageWrites;
 696         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 697         uint8           info_orig = info;       /* restored if we loop back to "begin" */
 698         static XLogRecord *rechdr;      /* header workspace, kept across calls */
 699
             /* Allocate and zero the header workspace on the first call only */
 700         if (rechdr == NULL)
 701         {
 702                 rechdr = malloc(SizeOfXLogRecord);
 703                 if (rechdr == NULL)
 704                         elog(ERROR, "out of memory");
 705                 MemSet(rechdr, 0, SizeOfXLogRecord);
 706         }
 707
 708         /* cross-check on whether we should be here or not */
 709         if (!XLogInsertAllowed())
 710                 elog(ERROR, "cannot make new WAL entries during recovery");
 711
 712         /* the info bits in XLR_INFO_MASK are reserved for this function's own use */
 713         if (info & XLR_INFO_MASK)
 714                 elog(PANIC, "invalid xlog info mask %02X", info);
 715
 716         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 717
 718         /*
 719          * In bootstrap mode, we don't actually log anything but XLOG resources;
 720          * return a phony record pointer.
 721          */
 722         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 723         {
 724                 RecPtr = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 725                 return RecPtr;
 726         }
 727
 728         /*
 729          * Here we scan the rdata chain, to determine which buffers must be backed
 730          * up.
 731          *
 732          * We may have to loop back to here if a race condition is detected below.
 733          * We could prevent the race by doing all this work while holding the
 734          * insert lock, but it seems better to avoid doing CRC calculations while
 735          * holding the lock.
 736          *
 737          * We add entries for backup blocks to the chain, so that they don't need
 738          * any special treatment in the critical section where the chunks are
 739          * copied into the WAL buffers. Those entries have to be unlinked from the
 740          * chain if we have to loop back here.
 741          */
 742 begin:;
             /* Reset the per-buffer bookkeeping; dtbuf[] slots fill in chain order */
 743         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 744         {
 745                 dtbuf[i] = InvalidBuffer;
 746                 dtbuf_bkp[i] = false;
 747         }
 748
 749         /*
 750          * Decide if we need to do full-page writes in this XLOG record: true if
 751          * full_page_writes is on or we have a PITR request for it.  Since we
 752          * don't yet have the insert lock, fullPageWrites and forcePageWrites
 753          * could change under us, but we'll recheck them once we have the lock.
 754          */
 755         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 756
             /*
              * len accumulates the length of the data that stays in the record
              * proper; entries whose buffer gets a backup block have their data
              * pointer and length cleared and so contribute nothing.
              */
 757         len = 0;
 758         for (rdt = rdata;;)
 759         {
 760                 if (rdt->buffer == InvalidBuffer)
 761                 {
 762                         /* Simple data, just include it */
 763                         len += rdt->len;
 764                 }
 765                 else
 766                 {
 767                         /* Find info for buffer */
 768                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 769                         {
 770                                 if (rdt->buffer == dtbuf[i])
 771                                 {
 772                                         /* Buffer already referenced by earlier chain item */
 773                                         if (dtbuf_bkp[i])
 774                                         {
 775                                                 rdt->data = NULL;
 776                                                 rdt->len = 0;
 777                                         }
 778                                         else if (rdt->data)
 779                                                 len += rdt->len;
 780                                         break;
 781                                 }
 782                                 if (dtbuf[i] == InvalidBuffer)
 783                                 {
 784                                         /* OK, put it in this slot */
 785                                         dtbuf[i] = rdt->buffer;
 786                                         if (XLogCheckBuffer(rdt, doPageWrites,
 787                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 788                                         {
 789                                                 dtbuf_bkp[i] = true;
 790                                                 rdt->data = NULL;
 791                                                 rdt->len = 0;
 792                                         }
 793                                         else if (rdt->data)
 794                                                 len += rdt->len;
 795                                         break;
 796                                 }
 797                         }
                             /* Inner loop ran off the end: more distinct buffers than slots */
 798                         if (i >= XLR_MAX_BKP_BLOCKS)
 799                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 800                                          XLR_MAX_BKP_BLOCKS);
 801                 }
 802                 /* Break out of loop when rdt points to last chain item */
 803                 if (rdt->next == NULL)
 804                         break;
 805                 rdt = rdt->next;
 806         }
 807
 808         /*
 809          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 810          * error checking in ReadRecord.  This means that all callers of
 811          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 812          * make an exception for XLOG SWITCH records because we don't want them to
 813          * ever cross a segment boundary.
 814          */
 815         if (len == 0 && !isLogSwitch)
 816                 elog(PANIC, "invalid xlog record length %u", len);
 817
 818         /*
 819          * Make additional rdata chain entries for the backup blocks, so that we
 820          * don't need to special-case them in the write loop.  This modifies the
 821          * original rdata chain, but we keep a pointer to the last regular entry,
 822          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 823          * beginning.
 824          *
 825          * At the exit of this loop, write_len includes the backup block data.
 826          *
 827          * Also set the appropriate info bits to show which buffers were backed
 828          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
 829          * value (ignoring InvalidBuffer) appearing in the rdata chain.
 830          */
 831         rdt_lastnormal = rdt;
 832         write_len = len;
 833         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 834         {
 835                 BkpBlock   *bkpb;
 836                 char       *page;
 837
 838                 if (!dtbuf_bkp[i])
 839                         continue;
 840
 841                 info |= XLR_BKP_BLOCK(i);
 842
 843                 bkpb = &(dtbuf_xlg[i]);
 844                 page = (char *) BufferGetBlock(dtbuf[i]);
 845
                     /* First entry: the BkpBlock header describing the page */
 846                 rdt->next = &(dtbuf_rdt1[i]);
 847                 rdt = rdt->next;
 848
 849                 rdt->data = (char *) bkpb;
 850                 rdt->len = sizeof(BkpBlock);
 851                 write_len += sizeof(BkpBlock);
 852
                     /* Second entry: the whole page, or the part before the hole */
 853                 rdt->next = &(dtbuf_rdt2[i]);
 854                 rdt = rdt->next;
 855
 856                 if (bkpb->hole_length == 0)
 857                 {
 858                         rdt->data = page;
 859                         rdt->len = BLCKSZ;
 860                         write_len += BLCKSZ;
 861                         rdt->next = NULL;
 862                 }
 863                 else
 864                 {
 865                         /* must skip the hole */
 866                         rdt->data = page;
 867                         rdt->len = bkpb->hole_offset;
 868                         write_len += bkpb->hole_offset;
 869
                             /* Third entry: the part of the page after the hole */
 870                         rdt->next = &(dtbuf_rdt3[i]);
 871                         rdt = rdt->next;
 872
 873                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 874                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 875                         write_len += rdt->len;
 876                         rdt->next = NULL;
 877                 }
 878         }
 879
 880         /*
 881          * Calculate CRC of the data, including all the backup blocks
 882          *
 883          * Note that the record header isn't added into the CRC initially since we
 884          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
 885          * the whole record in the order: rdata, then backup blocks, then record
 886          * header.
 887          */
 888         INIT_CRC32(rdata_crc);
 889         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
 890                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 891
 892         /*
 893          * Construct record header (prev-link and CRC are filled in later), and
 894          * make that the first chunk in the chain.
 895          */
 896         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
 897         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
 898         rechdr->xl_len = len;           /* doesn't include backup blocks */
 899         rechdr->xl_info = info;
 900         rechdr->xl_rmid = rmid;
 901
 902         hdr_rdt.next = rdata;
 903         hdr_rdt.data = (char *) rechdr;
 904         hdr_rdt.len = SizeOfXLogRecord;
 905
 906         write_len += SizeOfXLogRecord;  /* write_len now counts the header too */
 907
 908         START_CRIT_SECTION();
 909
 910         /* Now wait to get insert lock */
 911         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 912
 913         /*
 914          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 915          * back and recompute everything.  This can only happen just after a
 916          * checkpoint, so it's better to be slow in this case and fast otherwise.
 917          *
 918          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 919          * affect the contents of the XLOG record, so we'll update our local copy
 920          * but not force a recomputation.
 921          */
 922         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 923         {
 924                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 925                 RedoRecPtr = Insert->RedoRecPtr;
 926
 927                 if (doPageWrites)
 928                 {
 929                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 930                         {
 931                                 if (dtbuf[i] == InvalidBuffer)
 932                                         continue;
 933                                 if (dtbuf_bkp[i] == false &&
 934                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 935                                 {
 936                                         /*
 937                                          * Oops, this buffer now needs to be backed up, but we
 938                                          * didn't think so above.  Start over.  Before retrying,
 939                                          * cut the backup-block entries off the caller's chain
                                              * and restore the caller's info bits.
 940                                          */
 941                                         LWLockRelease(WALInsertLock);
 942                                         END_CRIT_SECTION();
 943                                         rdt_lastnormal->next = NULL;
 944                                         info = info_orig;
 945                                         goto begin;
 946                                 }
 947                         }
 948                 }
 949         }
 950
 951         /*
 952          * Also check to see if fullPageWrites or forcePageWrites was just turned
 953          * on; if we weren't already doing full-page writes then go back and
 954          * recompute. (If it was just turned off, we could recompute the record
 955          * without full pages, but we choose not to bother.)
 956          */
 957         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
 958         {
 959                 /* Oops, must redo it with full-page data. */
 960                 LWLockRelease(WALInsertLock);
 961                 END_CRIT_SECTION();
 962                 rdt_lastnormal->next = NULL;
 963                 info = info_orig;
 964                 goto begin;
 965         }
 966
 967         /*
 968          * If the current page is completely full, the record goes to the next
 969          * page, right after the page header.
 970          */
 971         updrqst = false;
 972         freespace = INSERT_FREESPACE(Insert);
 973         if (freespace == 0)
 974         {
 975                 updrqst = AdvanceXLInsertBuffer(false);
 976                 freespace = INSERT_FREESPACE(Insert);
 977         }
 978
 979         /* Compute record's XLOG location */
 980         curridx = Insert->curridx;
 981         INSERT_RECPTR(RecPtr, Insert, curridx);
 982
 983         /*
 984          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 985          * segment, we need not insert it (and don't want to because we'd like
 986          * consecutive switch requests to be no-ops).  Instead, make sure
 987          * everything is written and flushed through the end of the prior segment,
 988          * and return the prior segment's end address.
 989          */
 990         if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
 991         {
 992                 /* We can release insert lock immediately */
 993                 LWLockRelease(WALInsertLock);
 994
                     /* Back up over the long page header to the segment boundary */
 995                 RecPtr -= SizeOfXLogLongPHD;
 996
 997                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 998                 LogwrtResult = XLogCtl->LogwrtResult;
 999                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
1000                 {
1001                         XLogwrtRqst FlushRqst;
1002
1003                         FlushRqst.Write = RecPtr;
1004                         FlushRqst.Flush = RecPtr;
1005                         XLogWrite(FlushRqst, false, false);
1006                 }
1007                 LWLockRelease(WALWriteLock);
1008
1009                 END_CRIT_SECTION();
1010
1011                 /* wake up walsenders now that we've released heavily contended locks */
1012                 WalSndWakeupProcessRequests();
1013                 return RecPtr;
1014         }
1015
1016         /* Finish the record header */
1017         rechdr->xl_prev = Insert->PrevRecord;
1018
1019         /* Now we can finish computing the record's CRC */
1020         COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
1021         FIN_CRC32(rdata_crc);
1022         rechdr->xl_crc = rdata_crc;
1023
1024 #ifdef WAL_DEBUG
1025         if (XLOG_DEBUG)
1026         {
1027                 StringInfoData buf;
1028
1029                 initStringInfo(&buf);
1030                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1031                                                  (uint32) (RecPtr >> 32), (uint32) RecPtr);
1032                 xlog_outrec(&buf, rechdr);
1033                 if (rdata->data != NULL)
1034                 {
1035                         appendStringInfo(&buf, " - ");
1036                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1037                 }
1038                 elog(LOG, "%s", buf.data);
1039                 pfree(buf.data);
1040         }
1041 #endif
1042
1043         /* Record begin of record in appropriate places */
1044         ProcLastRecPtr = RecPtr;
1045         Insert->PrevRecord = RecPtr;
1046
1047         /*
1048          * Append the data, including backup blocks if any.  The chain is walked
              * starting from the header entry, and entries are consumed in place
              * (their data pointer and len are adjusted when split across pages).
1049          */
1050         rdata = &hdr_rdt;
1051         while (write_len)
1052         {
                     /* Skip entries carrying no data, e.g. those cleared above */
1053                 while (rdata->data == NULL)
1054                         rdata = rdata->next;
1055
1056                 if (freespace > 0)
1057                 {
1058                         if (rdata->len > freespace)
1059                         {
                                     /* Chunk doesn't fit: fill the page, then advance below */
1060                                 memcpy(Insert->currpos, rdata->data, freespace);
1061                                 rdata->data += freespace;
1062                                 rdata->len -= freespace;
1063                                 write_len -= freespace;
1064                         }
1065                         else
1066                         {
                                     /* Chunk fits entirely on the current page */
1067                                 memcpy(Insert->currpos, rdata->data, rdata->len);
1068                                 freespace -= rdata->len;
1069                                 write_len -= rdata->len;
1070                                 Insert->currpos += rdata->len;
1071                                 rdata = rdata->next;
1072                                 continue;
1073                         }
1074                 }
1075
1076                 /* Use next buffer */
1077                 updrqst = AdvanceXLInsertBuffer(false);
1078                 curridx = Insert->curridx;
1079                 /* Mark page header to indicate this record continues on the page */
1080                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1081                 Insert->currpage->xlp_rem_len = write_len;
1082                 freespace = INSERT_FREESPACE(Insert);
1083         }
1084
1085         /* Ensure next record will be properly aligned */
1086         Insert->currpos = (char *) Insert->currpage +
1087                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
1088         freespace = INSERT_FREESPACE(Insert);
1089
1090         /*
1091          * The recptr I return is the beginning of the *next* record. This will be
1092          * stored as LSN for changed data pages...
1093          */
1094         INSERT_RECPTR(RecPtr, Insert, curridx);
1095
1096         /*
1097          * If the record is an XLOG_SWITCH, we must now write and flush all the
1098          * existing data, and then forcibly advance to the start of the next
1099          * segment.  It's not good to do this I/O while holding the insert lock,
1100          * but there seems too much risk of confusion if we try to release the
1101          * lock sooner.  Fortunately xlog switch needn't be a high-performance
1102          * operation anyway...
1103          */
1104         if (isLogSwitch)
1105         {
1106                 XLogwrtRqst FlushRqst;
1107                 XLogRecPtr      OldSegEnd;
1108
1109                 TRACE_POSTGRESQL_XLOG_SWITCH();
1110
1111                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1112
1113                 /*
1114                  * Flush through the end of the page containing XLOG_SWITCH, and
1115                  * perform end-of-segment actions (eg, notifying archiver).
1116                  */
1117                 WriteRqst = XLogCtl->xlblocks[curridx];
1118                 FlushRqst.Write = WriteRqst;
1119                 FlushRqst.Flush = WriteRqst;
1120                 XLogWrite(FlushRqst, false, true);
1121
1122                 /* Set up the next buffer as first page of next segment */
1123                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1124                 (void) AdvanceXLInsertBuffer(true);
1125
1126                 /* There should be no unwritten data */
1127                 curridx = Insert->curridx;
1128                 Assert(curridx == XLogCtl->Write.curridx);
1129
1130                 /* Compute end address of old segment */
1131                 OldSegEnd = XLogCtl->xlblocks[curridx];
1132                 OldSegEnd -= XLOG_BLCKSZ;
1133
1134                 /* Make it look like we've written and synced all of old segment */
1135                 LogwrtResult.Write = OldSegEnd;
1136                 LogwrtResult.Flush = OldSegEnd;
1137
1138                 /*
1139                  * Update shared-memory status --- this code should match XLogWrite
1140                  */
1141                 {
1142                         /* use volatile pointer to prevent code rearrangement */
1143                         volatile XLogCtlData *xlogctl = XLogCtl;
1144
1145                         SpinLockAcquire(&xlogctl->info_lck);
1146                         xlogctl->LogwrtResult = LogwrtResult;
1147                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1148                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1149                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1150                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1151                         SpinLockRelease(&xlogctl->info_lck);
1152                 }
1153
1154                 LWLockRelease(WALWriteLock);
1155
1156                 updrqst = false;                /* done already */
1157         }
1158         else
1159         {
1160                 /* normal case, ie not xlog switch */
1161
1162                 /* Need to update shared LogwrtRqst if some block was filled up */
1163                 if (freespace == 0)
1164                 {
1165                         /* curridx is filled and available for writing out */
1166                         updrqst = true;
1167                 }
1168                 else
1169                 {
1170                         /* if updrqst already set, write through end of previous buf */
1171                         curridx = PrevBufIdx(curridx);
1172                 }
                     /* Only consulted below when updrqst is true */
1173                 WriteRqst = XLogCtl->xlblocks[curridx];
1174         }
1175
1176         LWLockRelease(WALInsertLock);
1177
1178         if (updrqst)
1179         {
1180                 /* use volatile pointer to prevent code rearrangement */
1181                 volatile XLogCtlData *xlogctl = XLogCtl;
1182
1183                 SpinLockAcquire(&xlogctl->info_lck);
1184                 /* advance global request to include new block(s) */
1185                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1186                         xlogctl->LogwrtRqst.Write = WriteRqst;
1187                 /* update local result copy while I have the chance */
1188                 LogwrtResult = xlogctl->LogwrtResult;
1189                 SpinLockRelease(&xlogctl->info_lck);
1190         }
1191
1192         XactLastRecEnd = RecPtr;
1193
1194         END_CRIT_SECTION();
1195
1196         /* wake up walsenders now that we've released heavily contended locks */
1197         WalSndWakeupProcessRequests();
1198
1199         return RecPtr;
1200 }
1200
1201 /*
1202  * Determine whether the buffer referenced by an XLogRecData item has to
1203  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1204  * save the buffer's LSN at *lsn.
1205  */
1206 static bool
1207 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1208                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1209 {
1210         Page            page;
1211
1212         page = BufferGetPage(rdata->buffer);
1213
1214         /*
1215          * XXX We assume page LSN is first data on *every* page that can be passed
1216          * to XLogInsert, whether it otherwise has the standard page layout or
1217          * not. We don't need the buffer header lock for PageGetLSN because we
1218          * have exclusive lock on the page and/or the relation.
1219          */
1220         *lsn = PageGetLSN(page);
1221
1222         if (doPageWrites &&
1223                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1224         {
1225                 /*
1226                  * The page needs to be backed up, so set up *bkpb
1227                  */
1228                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1229
1230                 if (rdata->buffer_std)
1231                 {
1232                         /* Assume we can omit data between pd_lower and pd_upper */
1233                         uint16          lower = ((PageHeader) page)->pd_lower;
1234                         uint16          upper = ((PageHeader) page)->pd_upper;
1235
1236                         if (lower >= SizeOfPageHeaderData &&
1237                                 upper > lower &&
1238                                 upper <= BLCKSZ)
1239                         {
1240                                 bkpb->hole_offset = lower;
1241                                 bkpb->hole_length = upper - lower;
1242                         }
1243                         else
1244                         {
1245                                 /* No "hole" to compress out */
1246                                 bkpb->hole_offset = 0;
1247                                 bkpb->hole_length = 0;
1248                         }
1249                 }
1250                 else
1251                 {
1252                         /* Not a standard page header, don't try to eliminate "hole" */
1253                         bkpb->hole_offset = 0;
1254                         bkpb->hole_length = 0;
1255                 }
1256
1257                 return true;                    /* buffer requires backup */
1258         }
1259
1260         return false;                           /* buffer does not need to be backed up */
1261 }
1262
1263 /*
1264  * Advance the Insert state to the next buffer page, writing out the next
1265  * buffer if it still contains unwritten data.
1266  *
1267  * If new_segment is TRUE then we set up the next buffer page as the first
1268  * page of the next xlog segment file, possibly but not usually the next
1269  * consecutive file page.
1270  *
1271  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1272  * just-filled page.  If we can do this for free (without an extra lock),
1273  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1274  * request update still needs to be done, FALSE if we did it internally.
1275  *
1276  * Must be called with WALInsertLock held.
1277  */
1278 static bool
1279 AdvanceXLInsertBuffer(bool new_segment)
1280 {
1281         XLogCtlInsert *Insert = &XLogCtl->Insert;
1282         int                     nextidx = NextBufIdx(Insert->curridx);
1283         bool            update_needed = true;
1284         XLogRecPtr      OldPageRqstPtr;
1285         XLogwrtRqst WriteRqst;
1286         XLogRecPtr      NewPageEndPtr;
1287         XLogRecPtr      NewPageBeginPtr;
1288         XLogPageHeader NewPage;
1289
1290         /*
1291          * Get ending-offset of the buffer page we need to replace (this may be
1292          * zero if the buffer hasn't been used yet).  Fall through if it's already
1293          * written out.
1294          */
1295         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1296         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1297         {
1298                 /* nope, got work to do... */
1299                 XLogRecPtr      FinishedPageRqstPtr;
1300
1301                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1302
1303                 /* Before waiting, get info_lck and update LogwrtResult */
1304                 {
1305                         /* use volatile pointer to prevent code rearrangement */
1306                         volatile XLogCtlData *xlogctl = XLogCtl;
1307
1308                         SpinLockAcquire(&xlogctl->info_lck);
1309                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1310                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1311                         LogwrtResult = xlogctl->LogwrtResult;
1312                         SpinLockRelease(&xlogctl->info_lck);
1313                 }
1314
1315                 update_needed = false;  /* Did the shared-request update */
1316
1317                 /*
1318                  * Now that we have an up-to-date LogwrtResult value, see if we still
1319                  * need to write it or if someone else already did.
1320                  */
1321                 if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1322                 {
1323                         /* Must acquire write lock */
1324                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1325                         LogwrtResult = XLogCtl->LogwrtResult;
1326                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1327                         {
1328                                 /* OK, someone wrote it already */
1329                                 LWLockRelease(WALWriteLock);
1330                         }
1331                         else
1332                         {
1333                                 /*
1334                                  * Have to write buffers while holding insert lock. This is
1335                                  * not good, so only write as much as we absolutely must.
1336                                  */
1337                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1338                                 WriteRqst.Write = OldPageRqstPtr;
1339                                 WriteRqst.Flush = 0;
1340                                 XLogWrite(WriteRqst, false, false);
1341                                 LWLockRelease(WALWriteLock);
1342                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1343                         }
1344                 }
1345         }
1346
1347         /*
1348          * Now the next buffer slot is free and we can set it up to be the next
1349          * output page.
1350          */
1351         NewPageBeginPtr = XLogCtl->xlblocks[Insert->curridx];
1352
1353         if (new_segment)
1354         {
1355                 /* force it to a segment start point */
1356                 if (NewPageBeginPtr % XLogSegSize != 0)
1357                         XLByteAdvance(NewPageBeginPtr,
1358                                                   XLogSegSize - NewPageBeginPtr % XLogSegSize);
1359         }
1360
1361         NewPageEndPtr = NewPageBeginPtr;
1362         XLByteAdvance(NewPageEndPtr, XLOG_BLCKSZ);
1363         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1364         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1365
1366         Insert->curridx = nextidx;
1367         Insert->currpage = NewPage;
1368
1369         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1370
1371         /*
1372          * Be sure to re-zero the buffer so that bytes beyond what we've written
1373          * will look like zeroes and not valid XLOG records...
1374          */
1375         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1376
1377         /*
1378          * Fill the new page's header
1379          */
1380         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1381
1382         /* NewPage->xlp_info = 0; */    /* done by memset */
1383         NewPage   ->xlp_tli = ThisTimeLineID;
1384         NewPage   ->xlp_pageaddr = NewPageBeginPtr;
1385
1386         /*
1387          * If online backup is not in progress, mark the header to indicate that
1388          * WAL records beginning in this page have removable backup blocks.  This
1389          * allows the WAL archiver to know whether it is safe to compress archived
1390          * WAL data by transforming full-block records into the non-full-block
1391          * format.      It is sufficient to record this at the page level because we
1392          * force a page switch (in fact a segment switch) when starting a backup,
1393          * so the flag will be off before any records can be written during the
1394          * backup.      At the end of a backup, the last page will be marked as all
1395          * unsafe when perhaps only part is unsafe, but at worst the archiver
1396          * would miss the opportunity to compress a few records.
1397          */
1398         if (!Insert->forcePageWrites)
1399                 NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
1400
1401         /*
1402          * If first page of an XLOG segment file, make it a long header.
1403          */
1404         if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
1405         {
1406                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1407
1408                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1409                 NewLongPage->xlp_seg_size = XLogSegSize;
1410                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1411                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1412
1413                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1414         }
1415
1416         return update_needed;
1417 }
1418
1419 /*
1420  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1421  *
1422  * new_segno indicates a log file that has just been filled up (or read
1423  * during recovery). We measure the distance from RedoRecPtr to new_segno
1424  * and see if that exceeds CheckPointSegments.
1425  *
1426  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1427  */
1428 static bool
1429 XLogCheckpointNeeded(XLogSegNo new_segno)
1430 {
1431         XLogSegNo       old_segno;
1432
1433         XLByteToSeg(RedoRecPtr, old_segno);
1434
1435         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
1436                 return true;
1437         return false;
1438 }
1439
1440 /*
1441  * Write and/or fsync the log at least as far as WriteRqst indicates.
1442  *
1443  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1444  * may stop at any convenient boundary (such as a cache or logfile boundary).
1445  * This option allows us to avoid uselessly issuing multiple writes when a
1446  * single one would do.
1447  *
1448  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1449  * perform end-of-segment actions after writing the last page, even if
1450  * it's not physically the end of its segment.  (NB: this will work properly
1451  * only if caller specifies WriteRqst == page-end and flexible == false,
1452  * and there is some data to write.)
1453  *
1454  * Must be called with WALWriteLock held.
1455  */
1456 static void
1457 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1458 {
1459         XLogCtlWrite *Write = &XLogCtl->Write;
1460         bool            ispartialpage;
1461         bool            last_iteration;
1462         bool            finishing_seg;
1463         bool            use_existent;
1464         int                     curridx;
1465         int                     npages;
1466         int                     startidx;
1467         uint32          startoffset;
1468
1469         /* We should always be inside a critical section here */
1470         Assert(CritSectionCount > 0);
1471
1472         /*
1473          * Update local LogwrtResult (caller probably did this already, but...)
1474          */
1475         LogwrtResult = XLogCtl->LogwrtResult;
1476
1477         /*
1478          * Since successive pages in the xlog cache are consecutively allocated,
1479          * we can usually gather multiple pages together and issue just one
1480          * write() call.  npages is the number of pages we have determined can be
1481          * written together; startidx is the cache block index of the first one,
1482          * and startoffset is the file offset at which it should go. The latter
1483          * two variables are only valid when npages > 0, but we must initialize
1484          * all of them to keep the compiler quiet.
1485          */
1486         npages = 0;
1487         startidx = 0;
1488         startoffset = 0;
1489
1490         /*
1491          * Within the loop, curridx is the cache block index of the page to
1492          * consider writing.  We advance Write->curridx only after successfully
1493          * writing pages.  (Right now, this refinement is useless since we are
1494          * going to PANIC if any error occurs anyway; but someday it may come in
1495          * useful.)
1496          */
1497         curridx = Write->curridx;
1498
1499         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1500         {
1501                 /*
1502                  * Make sure we're not ahead of the insert process.  This could happen
1503                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1504                  * last page that's been initialized by AdvanceXLInsertBuffer.
1505                  */
1506                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1507                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1508                                  (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
1509                                  (uint32) (XLogCtl->xlblocks[curridx] >> 32),
1510                                  (uint32) XLogCtl->xlblocks[curridx]);
1511
1512                 /* Advance LogwrtResult.Write to end of current buffer page */
1513                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1514                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1515
1516                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
1517                 {
1518                         /*
1519                          * Switch to new logfile segment.  We cannot have any pending
1520                          * pages here (since we dump what we have at segment end).
1521                          */
1522                         Assert(npages == 0);
1523                         if (openLogFile >= 0)
1524                                 XLogFileClose();
1525                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1526
1527                         /* create/use new log file */
1528                         use_existent = true;
1529                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
1530                         openLogOff = 0;
1531                 }
1532
1533                 /* Make sure we have the current logfile open */
1534                 if (openLogFile < 0)
1535                 {
1536                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1537                         openLogFile = XLogFileOpen(openLogSegNo);
1538                         openLogOff = 0;
1539                 }
1540
1541                 /* Add current page to the set of pending pages-to-dump */
1542                 if (npages == 0)
1543                 {
1544                         /* first of group */
1545                         startidx = curridx;
1546                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
1547                 }
1548                 npages++;
1549
1550                 /*
1551                  * Dump the set if this will be the last loop iteration, or if we are
1552                  * at the last page of the cache area (since the next page won't be
1553                  * contiguous in memory), or if we are at the end of the logfile
1554                  * segment.
1555                  */
1556                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1557
1558                 finishing_seg = !ispartialpage &&
1559                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1560
1561                 if (last_iteration ||
1562                         curridx == XLogCtl->XLogCacheBlck ||
1563                         finishing_seg)
1564                 {
1565                         char       *from;
1566                         Size            nbytes;
1567
1568                         /* Need to seek in the file? */
1569                         if (openLogOff != startoffset)
1570                         {
1571                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1572                                         ereport(PANIC,
1573                                                         (errcode_for_file_access(),
1574                                                          errmsg("could not seek in log file %s to offset %u: %m",
1575                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
1576                                                                         startoffset)));
1577                                 openLogOff = startoffset;
1578                         }
1579
1580                         /* OK to write the page(s) */
1581                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1582                         nbytes = npages * (Size) XLOG_BLCKSZ;
1583                         errno = 0;
1584                         if (write(openLogFile, from, nbytes) != nbytes)
1585                         {
1586                                 /* if write didn't set errno, assume no disk space */
1587                                 if (errno == 0)
1588                                         errno = ENOSPC;
1589                                 ereport(PANIC,
1590                                                 (errcode_for_file_access(),
1591                                                  errmsg("could not write to log file %s "
1592                                                                 "at offset %u, length %lu: %m",
1593                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo),
1594                                                                 openLogOff, (unsigned long) nbytes)));
1595                         }
1596
1597                         /* Update state for write */
1598                         openLogOff += nbytes;
1599                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1600                         npages = 0;
1601
1602                         /*
1603                          * If we just wrote the whole last page of a logfile segment,
1604                          * fsync the segment immediately.  This avoids having to go back
1605                          * and re-open prior segments when an fsync request comes along
1606                          * later. Doing it here ensures that one and only one backend will
1607                          * perform this fsync.
1608                          *
1609                          * We also do this if this is the last page written for an xlog
1610                          * switch.
1611                          *
1612                          * This is also the right place to notify the Archiver that the
1613                          * segment is ready to copy to archival storage, and to update the
1614                          * timer for archive_timeout, and to signal for a checkpoint if
1615                          * too many logfile segments have been used since the last
1616                          * checkpoint.
1617                          */
1618                         if (finishing_seg || (xlog_switch && last_iteration))
1619                         {
1620                                 issue_xlog_fsync(openLogFile, openLogSegNo);
1621
1622                                 /* signal that we need to wakeup walsenders later */
1623                                 WalSndWakeupRequest();
1624
1625                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1626
1627                                 if (XLogArchivingActive())
1628                                         XLogArchiveNotifySeg(openLogSegNo);
1629
1630                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1631
1632                                 /*
1633                                  * Request a checkpoint if we've consumed too much xlog since
1634                                  * the last one.  For speed, we first check using the local
1635                                  * copy of RedoRecPtr, which might be out of date; if it looks
1636                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
1637                                  * recheck.
1638                                  */
1639                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
1640                                 {
1641                                         (void) GetRedoRecPtr();
1642                                         if (XLogCheckpointNeeded(openLogSegNo))
1643                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1644                                 }
1645                         }
1646                 }
1647
1648                 if (ispartialpage)
1649                 {
1650                         /* Only asked to write a partial page */
1651                         LogwrtResult.Write = WriteRqst.Write;
1652                         break;
1653                 }
1654                 curridx = NextBufIdx(curridx);
1655
1656                 /* If flexible, break out of loop as soon as we wrote something */
1657                 if (flexible && npages == 0)
1658                         break;
1659         }
1660
1661         Assert(npages == 0);
1662         Assert(curridx == Write->curridx);
1663
1664         /*
1665          * If asked to flush, do so
1666          */
1667         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1668                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1669         {
1670                 /*
1671                  * Could get here without iterating above loop, in which case we might
1672                  * have no open file or the wrong one.  However, we do not need to
1673                  * fsync more than one file.
1674                  */
1675                 if (sync_method != SYNC_METHOD_OPEN &&
1676                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1677                 {
1678                         if (openLogFile >= 0 &&
1679                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
1680                                 XLogFileClose();
1681                         if (openLogFile < 0)
1682                         {
1683                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1684                                 openLogFile = XLogFileOpen(openLogSegNo);
1685                                 openLogOff = 0;
1686                         }
1687
1688                         issue_xlog_fsync(openLogFile, openLogSegNo);
1689                 }
1690
1691                 /* signal that we need to wakeup walsenders later */
1692                 WalSndWakeupRequest();
1693
1694                 LogwrtResult.Flush = LogwrtResult.Write;
1695         }
1696
1697         /*
1698          * Update shared-memory status
1699          *
1700          * We make sure that the shared 'request' values do not fall behind the
1701          * 'result' values.  This is not absolutely essential, but it saves some
1702          * code in a couple of places.
1703          */
1704         {
1705                 /* use volatile pointer to prevent code rearrangement */
1706                 volatile XLogCtlData *xlogctl = XLogCtl;
1707
1708                 SpinLockAcquire(&xlogctl->info_lck);
1709                 xlogctl->LogwrtResult = LogwrtResult;
1710                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1711                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1712                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1713                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1714                 SpinLockRelease(&xlogctl->info_lck);
1715         }
1716 }
1717
1718 /*
1719  * Record the LSN for an asynchronous transaction commit/abort
1720  * and nudge the WALWriter if there is work for it to do.
1721  * (This should not be called for synchronous commits.)
1722  */
1723 void
1724 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1725 {
1726         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
1727         bool            sleeping;
1728
1729         /* use volatile pointer to prevent code rearrangement */
1730         volatile XLogCtlData *xlogctl = XLogCtl;
1731
1732         SpinLockAcquire(&xlogctl->info_lck);
1733         LogwrtResult = xlogctl->LogwrtResult;
1734         sleeping = xlogctl->WalWriterSleeping;
1735         if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
1736                 xlogctl->asyncXactLSN = asyncXactLSN;
1737         SpinLockRelease(&xlogctl->info_lck);
1738
1739         /*
1740          * If the WALWriter is sleeping, we should kick it to make it come out of
1741          * low-power mode.      Otherwise, determine whether there's a full page of
1742          * WAL available to write.
1743          */
1744         if (!sleeping)
1745         {
1746                 /* back off to last completed page boundary */
1747                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
1748
1749                 /* if we have already flushed that far, we're done */
1750                 if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1751                         return;
1752         }
1753
1754         /*
1755          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
1756          * to come out of low-power mode so that this async commit will reach disk
1757          * within the expected amount of time.
1758          */
1759         if (ProcGlobal->walwriterLatch)
1760                 SetLatch(ProcGlobal->walwriterLatch);
1761 }
1762
1763 /*
1764  * Advance minRecoveryPoint in control file.
1765  *
1766  * If we crash during recovery, we must reach this point again before the
1767  * database is consistent.
1768  *
1769  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1770  * is only updated if it's not already greater than or equal to 'lsn'.
1771  */
1772 static void
1773 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1774 {
1775         /* Quick check using our local copy of the variable */
1776         if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
1777                 return;
1778
1779         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1780
1781         /* update local copy */
1782         minRecoveryPoint = ControlFile->minRecoveryPoint;
1783         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1784
1785         /*
1786          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1787          * i.e., we're doing crash recovery.  We never modify the control file's
1788          * value in that case, so we can short-circuit future checks here too.
1789          */
1790         if (minRecoveryPoint == 0)
1791                 updateMinRecoveryPoint = false;
1792         else if (force || XLByteLT(minRecoveryPoint, lsn))
1793         {
1794                 /* use volatile pointer to prevent code rearrangement */
1795                 volatile XLogCtlData *xlogctl = XLogCtl;
1796                 XLogRecPtr      newMinRecoveryPoint;
1797                 TimeLineID      newMinRecoveryPointTLI;
1798
1799                 /*
1800                  * To avoid having to update the control file too often, we update it
1801                  * all the way to the last record being replayed, even though 'lsn'
1802                  * would suffice for correctness.  This also allows the 'force' case
1803                  * to not need a valid 'lsn' value.
1804                  *
1805                  * Another important reason for doing it this way is that the passed
1806                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
1807                  * the caller got it from a corrupted heap page.  Accepting such a
1808                  * value as the min recovery point would prevent us from coming up at
1809                  * all.  Instead, we just log a warning and continue with recovery.
1810                  * (See also the comments about corrupt LSNs in XLogFlush.)
1811                  */
1812                 SpinLockAcquire(&xlogctl->info_lck);
1813                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1814                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
1815                 SpinLockRelease(&xlogctl->info_lck);
1816
1817                 if (!force && XLByteLT(newMinRecoveryPoint, lsn))
1818                         elog(WARNING,
1819                            "xlog min recovery request %X/%X is past current point %X/%X",
1820                                  (uint32) (lsn >> 32) , (uint32) lsn,
1821                                  (uint32) (newMinRecoveryPoint >> 32),
1822                                  (uint32) newMinRecoveryPoint);
1823
1824                 /* update control file */
1825                 if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
1826                 {
1827                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
1828                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
1829                         UpdateControlFile();
1830                         minRecoveryPoint = newMinRecoveryPoint;
1831                         minRecoveryPointTLI = newMinRecoveryPointTLI;
1832
1833                         ereport(DEBUG2,
1834                                         (errmsg("updated min recovery point to %X/%X on timeline %u",
1835                                                         (uint32) (minRecoveryPoint >> 32),
1836                                                         (uint32) minRecoveryPoint,
1837                                                         newMinRecoveryPointTLI)));
1838                 }
1839         }
1840         LWLockRelease(ControlFileLock);
1841 }
1842
1843 /*
1844  * Ensure that all XLOG data through the given position is flushed to disk.
1845  *
1846  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1847  * already held, and we try to avoid acquiring it if possible.
1848  */
1849 void
1850 XLogFlush(XLogRecPtr record)
1851 {
1852         XLogRecPtr      WriteRqstPtr;
1853         XLogwrtRqst WriteRqst;
1854
1855         /*
1856          * During REDO, we are reading not writing WAL.  Therefore, instead of
1857          * trying to flush the WAL, we should update minRecoveryPoint instead. We
1858          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
1859          * to act this way too, and because when it tries to write the
1860          * end-of-recovery checkpoint, it should indeed flush.
1861          */
1862         if (!XLogInsertAllowed())
1863         {
1864                 UpdateMinRecoveryPoint(record, false);
1865                 return;
1866         }
1867
1868         /* Quick exit if already known flushed */
1869         if (XLByteLE(record, LogwrtResult.Flush))
1870                 return;
1871
1872 #ifdef WAL_DEBUG
1873         if (XLOG_DEBUG)
1874                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1875                          (uint32) (record >> 32), (uint32) record,
1876                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
1877                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
1878 #endif
1879
1880         START_CRIT_SECTION();
1881
1882         /*
1883          * Since fsync is usually a horribly expensive operation, we try to
1884          * piggyback as much data as we can on each fsync: if we see any more data
1885          * entered into the xlog buffer, we'll write and fsync that too, so that
1886          * the final value of LogwrtResult.Flush is as large as possible. This
1887          * gives us some chance of avoiding another fsync immediately after.
1888          */
1889
1890         /* initialize to given target; may increase below */
1891         WriteRqstPtr = record;
1892
1893         /*
1894          * Now wait until we get the write lock, or someone else does the flush
1895          * for us.
1896          */
1897         for (;;)
1898         {
1899                 /* use volatile pointer to prevent code rearrangement */
1900                 volatile XLogCtlData *xlogctl = XLogCtl;
1901
1902                 /* read LogwrtResult and update local state */
1903                 SpinLockAcquire(&xlogctl->info_lck);
1904                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
1905                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1906                 LogwrtResult = xlogctl->LogwrtResult;
1907                 SpinLockRelease(&xlogctl->info_lck);
1908
1909                 /* done already? */
1910                 if (XLByteLE(record, LogwrtResult.Flush))
1911                         break;
1912
1913                 /*
1914                  * Try to get the write lock. If we can't get it immediately, wait
1915                  * until it's released, and recheck if we still need to do the flush
1916                  * or if the backend that held the lock did it for us already. This
1917                  * helps to maintain a good rate of group committing when the system
1918                  * is bottlenecked by the speed of fsyncing.
1919                  */
1920                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
1921                 {
1922                         /*
1923                          * The lock is now free, but we didn't acquire it yet. Before we
1924                          * do, loop back to check if someone else flushed the record for
1925                          * us already.
1926                          */
1927                         continue;
1928                 }
1929
1930                 /* Got the lock; recheck whether request is satisfied */
1931                 LogwrtResult = XLogCtl->LogwrtResult;
1932                 if (XLByteLE(record, LogwrtResult.Flush))
1933                 {
1934                         LWLockRelease(WALWriteLock);
1935                         break;
1936                 }
1937
1938                 /*
1939                  * Sleep before flush! By adding a delay here, we may give further
1940                  * backends the opportunity to join the backlog of group commit
1941                  * followers; this can significantly improve transaction throughput, at
1942                  * the risk of increasing transaction latency.
1943                  *
1944                  * We do not sleep if enableFsync is not turned on, nor if there are
1945                  * fewer than CommitSiblings other backends with active transactions.
1946                  */
1947                 if (CommitDelay > 0 && enableFsync &&
1948                         MinimumActiveBackends(CommitSiblings))
1949                         pg_usleep(CommitDelay);
1950
1951                 /* try to write/flush later additions to XLOG as well */
1952                 if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1953                 {
1954                         XLogCtlInsert *Insert = &XLogCtl->Insert;
1955                         uint32          freespace = INSERT_FREESPACE(Insert);
1956
1957                         if (freespace == 0)             /* buffer is full */
1958                                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1959                         else
1960                         {
1961                                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1962                                 WriteRqstPtr -= freespace;
1963                         }
1964                         LWLockRelease(WALInsertLock);
1965                         WriteRqst.Write = WriteRqstPtr;
1966                         WriteRqst.Flush = WriteRqstPtr;
1967                 }
1968                 else
1969                 {
1970                         WriteRqst.Write = WriteRqstPtr;
1971                         WriteRqst.Flush = record;
1972                 }
1973                 XLogWrite(WriteRqst, false, false);
1974
1975                 LWLockRelease(WALWriteLock);
1976                 /* done */
1977                 break;
1978         }
1979
1980         END_CRIT_SECTION();
1981
1982         /* wake up walsenders now that we've released heavily contended locks */
1983         WalSndWakeupProcessRequests();
1984
1985         /*
1986          * If we still haven't flushed to the request point then we have a
1987          * problem; most likely, the requested flush point is past end of XLOG.
1988          * This has been seen to occur when a disk page has a corrupted LSN.
1989          *
1990          * Formerly we treated this as a PANIC condition, but that hurts the
1991          * system's robustness rather than helping it: we do not want to take down
1992          * the whole system due to corruption on one data page.  In particular, if
1993          * the bad page is encountered again during recovery then we would be
1994          * unable to restart the database at all!  (This scenario actually
1995          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
1996          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
1997          * the only time we can reach here during recovery is while flushing the
1998          * end-of-recovery checkpoint record, and we don't expect that to have a
1999          * bad LSN.
2000          *
2001          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2002          * since xact.c calls this routine inside a critical section.  However,
2003          * calls from bufmgr.c are not within critical sections and so we will not
2004          * force a restart for a bad LSN on a data page.
2005          */
2006         if (XLByteLT(LogwrtResult.Flush, record))
2007                 elog(ERROR,
2008                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2009                          (uint32) (record >> 32), (uint32) record,
2010                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2011 }
2012
2013 /*
2014  * Flush xlog, but without specifying exactly where to flush to.
2015  *
2016  * We normally flush only completed blocks; but if there is nothing to do on
2017  * that basis, we check for unflushed async commits in the current incomplete
2018  * block, and flush through the latest one of those.  Thus, if async commits
2019  * are not being used, we will flush complete blocks only.      We can guarantee
2020  * that async commits reach disk after at most three cycles; normally only
2021  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2022  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2023  * difference only with very high load or long wal_writer_delay, but imposes
2024  * one extra cycle for the worst case for async commits.)
2025  *
2026  * This routine is invoked periodically by the background walwriter process.
2027  *
2028  * Returns TRUE if we flushed anything.
2029  */
2030 bool
2031 XLogBackgroundFlush(void)
2032 {
2033         XLogRecPtr      WriteRqstPtr;
2034         bool            flexible = true;
2035         bool            wrote_something = false;
2036
2037         /* XLOG doesn't need flushing during recovery */
2038         if (RecoveryInProgress())
2039                 return false;
2040
2041         /* read LogwrtResult and update local state */
2042         {
2043                 /* use volatile pointer to prevent code rearrangement */
2044                 volatile XLogCtlData *xlogctl = XLogCtl;
2045
2046                 SpinLockAcquire(&xlogctl->info_lck);
2047                 LogwrtResult = xlogctl->LogwrtResult;
2048                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2049                 SpinLockRelease(&xlogctl->info_lck);
2050         }
2051
2052         /* back off to last completed page boundary */
2053         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2054
2055         /* if we have already flushed that far, consider async commit records */
2056         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2057         {
2058                 /* use volatile pointer to prevent code rearrangement */
2059                 volatile XLogCtlData *xlogctl = XLogCtl;
2060
2061                 SpinLockAcquire(&xlogctl->info_lck);
2062                 WriteRqstPtr = xlogctl->asyncXactLSN;
2063                 SpinLockRelease(&xlogctl->info_lck);
2064                 flexible = false;               /* ensure it all gets written */
2065         }
2066
2067         /*
2068          * If already known flushed, we're done. Just need to check if we are
2069          * holding an open file handle to a logfile that's no longer in use,
2070          * preventing the file from being deleted.
2071          */
2072         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2073         {
2074                 if (openLogFile >= 0)
2075                 {
2076                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2077                         {
2078                                 XLogFileClose();
2079                         }
2080                 }
2081                 return false;
2082         }
2083
2084 #ifdef WAL_DEBUG
2085         if (XLOG_DEBUG)
2086                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2087                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
2088                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2089                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2090 #endif
2091
2092         START_CRIT_SECTION();
2093
2094         /* now wait for the write lock */
2095         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2096         LogwrtResult = XLogCtl->LogwrtResult;
2097         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2098         {
2099                 XLogwrtRqst WriteRqst;
2100
2101                 WriteRqst.Write = WriteRqstPtr;
2102                 WriteRqst.Flush = WriteRqstPtr;
2103                 XLogWrite(WriteRqst, flexible, false);
2104                 wrote_something = true;
2105         }
2106         LWLockRelease(WALWriteLock);
2107
2108         END_CRIT_SECTION();
2109
2110         /* wake up walsenders now that we've released heavily contended locks */
2111         WalSndWakeupProcessRequests();
2112
2113         return wrote_something;
2114 }
2115
2116 /*
2117  * Test whether XLOG data has been flushed up to (at least) the given position.
2118  *
2119  * Returns true if a flush is still needed.  (It may be that someone else
2120  * is already in process of flushing that far, however.)
2121  */
2122 bool
2123 XLogNeedsFlush(XLogRecPtr record)
2124 {
2125         /*
2126          * During recovery, we don't flush WAL but update minRecoveryPoint
2127          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2128          * would need to be updated.
2129          */
2130         if (RecoveryInProgress())
2131         {
2132                 /* Quick exit if already known updated */
2133                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2134                         return false;
2135
2136                 /*
2137                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2138                  * just return a conservative guess.
2139                  */
2140                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2141                         return true;
2142                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2143                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2144                 LWLockRelease(ControlFileLock);
2145
2146                 /*
2147                  * An invalid minRecoveryPoint means that we need to recover all the
2148                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2149                  * file's value in that case, so we can short-circuit future checks
2150                  * here too.
2151                  */
2152                 if (minRecoveryPoint == 0)
2153                         updateMinRecoveryPoint = false;
2154
2155                 /* check again */
2156                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2157                         return false;
2158                 else
2159                         return true;
2160         }
2161
2162         /* Quick exit if already known flushed */
2163         if (XLByteLE(record, LogwrtResult.Flush))
2164                 return false;
2165
2166         /* read LogwrtResult and update local state */
2167         {
2168                 /* use volatile pointer to prevent code rearrangement */
2169                 volatile XLogCtlData *xlogctl = XLogCtl;
2170
2171                 SpinLockAcquire(&xlogctl->info_lck);
2172                 LogwrtResult = xlogctl->LogwrtResult;
2173                 SpinLockRelease(&xlogctl->info_lck);
2174         }
2175
2176         /* check again */
2177         if (XLByteLE(record, LogwrtResult.Flush))
2178                 return false;
2179
2180         return true;
2181 }
2182
2183 /*
2184  * Create a new XLOG file segment, or open a pre-existing one.
2185  *
2186  * log, seg: identify segment to be created/opened.
2187  *
2188  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2189  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2190  * file was used.
2191  *
2192  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2193  * place.  This should be TRUE except during bootstrap log creation.  The
2194  * caller must *not* hold the lock at call.
2195  *
2196  * Returns FD of opened file.
2197  *
2198  * Note: errors here are ERROR not PANIC because we might or might not be
2199  * inside a critical section (eg, during checkpoint there is no reason to
2200  * take down the system on failure).  They will promote to PANIC if we are
2201  * in a critical section.
2202  */
2203 int
2204 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
2205 {
2206         char            path[MAXPGPATH];
2207         char            tmppath[MAXPGPATH];
2208         char       *zbuffer;
2209         XLogSegNo       installed_segno;
2210         int                     max_advance;
2211         int                     fd;
2212         int                     nbytes;
2213
2214         XLogFilePath(path, ThisTimeLineID, logsegno);
2215
2216         /*
2217          * Try to use existent file (checkpoint maker may have created it already)
2218          */
2219         if (*use_existent)
2220         {
2221                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2222                                                    S_IRUSR | S_IWUSR);
2223                 if (fd < 0)
2224                 {
2225                         if (errno != ENOENT)
2226                                 ereport(ERROR,
2227                                                 (errcode_for_file_access(),
2228                                                  errmsg("could not open file \"%s\": %m", path)));
2229                 }
2230                 else
2231                         return fd;
2232         }
2233
2234         /*
2235          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2236          * another process is doing the same thing.  If so, we will end up
2237          * pre-creating an extra log segment.  That seems OK, and better than
2238          * holding the lock throughout this lengthy process.
2239          */
2240         elog(DEBUG2, "creating and filling new WAL file");
2241
2242         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2243
2244         unlink(tmppath);
2245
2246         /*
2247          * Allocate a buffer full of zeros. This is done before opening the file
2248          * so that we don't leak the file descriptor if palloc fails.
2249          *
2250          * Note: palloc zbuffer, instead of just using a local char array, to
2251          * ensure it is reasonably well-aligned; this may save a few cycles
2252          * transferring data to the kernel.
2253          */
2254         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2255
2256         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2257         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2258                                            S_IRUSR | S_IWUSR);
2259         if (fd < 0)
2260                 ereport(ERROR,
2261                                 (errcode_for_file_access(),
2262                                  errmsg("could not create file \"%s\": %m", tmppath)));
2263
2264         /*
2265          * Zero-fill the file.  We have to do this the hard way to ensure that all
2266          * the file space has really been allocated --- on platforms that allow
2267          * "holes" in files, just seeking to the end doesn't allocate intermediate
2268          * space.  This way, we know that we have all the space and (after the
2269          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2270          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2271          * log file.
2272          */
2273         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2274         {
2275                 errno = 0;
2276                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2277                 {
2278                         int                     save_errno = errno;
2279
2280                         /*
2281                          * If we fail to make the file, delete it to release disk space
2282                          */
2283                         unlink(tmppath);
2284
2285                         close(fd);
2286
2287                         /* if write didn't set errno, assume problem is no disk space */
2288                         errno = save_errno ? save_errno : ENOSPC;
2289
2290                         ereport(ERROR,
2291                                         (errcode_for_file_access(),
2292                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2293                 }
2294         }
2295         pfree(zbuffer);
2296
2297         if (pg_fsync(fd) != 0)
2298         {
2299                 close(fd);
2300                 ereport(ERROR,
2301                                 (errcode_for_file_access(),
2302                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2303         }
2304
2305         if (close(fd))
2306                 ereport(ERROR,
2307                                 (errcode_for_file_access(),
2308                                  errmsg("could not close file \"%s\": %m", tmppath)));
2309
2310         /*
2311          * Now move the segment into place with its final name.
2312          *
2313          * If caller didn't want to use a pre-existing file, get rid of any
2314          * pre-existing file.  Otherwise, cope with possibility that someone else
2315          * has created the file while we were filling ours: if so, use ours to
2316          * pre-create a future log segment.
2317          */
2318         installed_segno = logsegno;
2319         max_advance = XLOGfileslop;
2320         if (!InstallXLogFileSegment(&installed_segno, tmppath,
2321                                                                 *use_existent, &max_advance,
2322                                                                 use_lock))
2323         {
2324                 /*
2325                  * No need for any more future segments, or InstallXLogFileSegment()
2326                  * failed to rename the file into place. If the rename failed, opening
2327                  * the file below will fail.
2328                  */
2329                 unlink(tmppath);
2330         }
2331
2332         /* Set flag to tell caller there was no existent file */
2333         *use_existent = false;
2334
2335         /* Now open original target segment (might not be file I just made) */
2336         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2337                                            S_IRUSR | S_IWUSR);
2338         if (fd < 0)
2339                 ereport(ERROR,
2340                                 (errcode_for_file_access(),
2341                    errmsg("could not open file \"%s\": %m", path)));
2342
2343         elog(DEBUG2, "done creating and filling new WAL file");
2344
2345         return fd;
2346 }
2347
2348 /*
2349  * Create a new XLOG file segment by copying a pre-existing one.
2350  *
2351  * destsegno: identify segment to be created.
2352  *
2353  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2354  *              a different timeline)
2355  *
2356  * Currently this is only used during recovery, and so there are no locking
2357  * considerations.      But we should be just as tense as XLogFileInit to avoid
2358  * emplacing a bogus file.
2359  */
2360 static void
2361 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
2362 {
2363         char            path[MAXPGPATH];
2364         char            tmppath[MAXPGPATH];
2365         char            buffer[XLOG_BLCKSZ];
2366         int                     srcfd;
2367         int                     fd;
2368         int                     nbytes;
2369
2370         /*
2371          * Open the source file
2372          */
2373         XLogFilePath(path, srcTLI, srcsegno);
2374         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
2375         if (srcfd < 0)
2376                 ereport(ERROR,
2377                                 (errcode_for_file_access(),
2378                                  errmsg("could not open file \"%s\": %m", path)));
2379
2380         /*
2381          * Copy into a temp file name.
2382          */
2383         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2384
2385         unlink(tmppath);
2386
2387         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2388         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2389                                                    S_IRUSR | S_IWUSR);
2390         if (fd < 0)
2391                 ereport(ERROR,
2392                                 (errcode_for_file_access(),
2393                                  errmsg("could not create file \"%s\": %m", tmppath)));
2394
2395         /*
2396          * Do the data copying.
2397          */
2398         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2399         {
2400                 errno = 0;
2401                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2402                 {
2403                         if (errno != 0)
2404                                 ereport(ERROR,
2405                                                 (errcode_for_file_access(),
2406                                                  errmsg("could not read file \"%s\": %m", path)));
2407                         else
2408                                 ereport(ERROR,
2409                                                 (errmsg("not enough data in file \"%s\"", path)));
2410                 }
2411                 errno = 0;
2412                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2413                 {
2414                         int                     save_errno = errno;
2415
2416                         /*
2417                          * If we fail to make the file, delete it to release disk space
2418                          */
2419                         unlink(tmppath);
2420                         /* if write didn't set errno, assume problem is no disk space */
2421                         errno = save_errno ? save_errno : ENOSPC;
2422
2423                         ereport(ERROR,
2424                                         (errcode_for_file_access(),
2425                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2426                 }
2427         }
2428
2429         if (pg_fsync(fd) != 0)
2430                 ereport(ERROR,
2431                                 (errcode_for_file_access(),
2432                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2433
2434         if (CloseTransientFile(fd))
2435                 ereport(ERROR,
2436                                 (errcode_for_file_access(),
2437                                  errmsg("could not close file \"%s\": %m", tmppath)));
2438
2439         CloseTransientFile(srcfd);
2440
2441         /*
2442          * Now move the segment into place with its final name.
2443          */
2444         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
2445                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2446 }
2447
2448 /*
2449  * Install a new XLOG segment file as a current or future log segment.
2450  *
2451  * This is used both to install a newly-created segment (which has a temp
2452  * filename while it's being created) and to recycle an old segment.
2453  *
2454  * *segno: identify segment to install as (or first possible target).
2455  * When find_free is TRUE, this is modified on return to indicate the
2456  * actual installation location or last segment searched.
2457  *
2458  * tmppath: initial name of file to install.  It will be renamed into place.
2459  *
2460  * find_free: if TRUE, install the new segment at the first empty segno
2461  * number at or after the passed numbers.  If FALSE, install the new segment
2462  * exactly where specified, deleting any existing segment file there.
2463  *
2464  * *max_advance: maximum number of segno slots to advance past the starting
2465  * point.  Fail if no free slot is found in this range.  On return, reduced
2466  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2467  * when find_free is FALSE.)
2468  *
2469  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2470  * place.  This should be TRUE except during bootstrap log creation.  The
2471  * caller must *not* hold the lock at call.
2472  *
2473  * Returns TRUE if the file was installed successfully.  FALSE indicates that
2474  * max_advance limit was exceeded, or an error occurred while renaming the
2475  * file into place.
2476  */
2477 static bool
2478 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
2479                                            bool find_free, int *max_advance,
2480                                            bool use_lock)
2481 {
2482         char            path[MAXPGPATH];
2483         struct stat stat_buf;
2484
2485         XLogFilePath(path, ThisTimeLineID, *segno);
2486
2487         /*
2488          * We want to be sure that only one process does this at a time.
2489          */
2490         if (use_lock)
2491                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2492
2493         if (!find_free)
2494         {
2495                 /* Force installation: get rid of any pre-existing segment file */
2496                 unlink(path);
2497         }
2498         else
2499         {
2500                 /* Find a free slot to put it in */
2501                 while (stat(path, &stat_buf) == 0)
2502                 {
2503                         if (*max_advance <= 0)
2504                         {
2505                                 /* Failed to find a free slot within specified range */
2506                                 if (use_lock)
2507                                         LWLockRelease(ControlFileLock);
2508                                 return false;
2509                         }
2510                         (*segno)++;
2511                         (*max_advance)--;
2512                         XLogFilePath(path, ThisTimeLineID, *segno);
2513                 }
2514         }
2515
2516         /*
2517          * Prefer link() to rename() here just to be really sure that we don't
2518          * overwrite an existing file.  However, there shouldn't be one, so
2519          * rename() is an acceptable substitute except for the truly paranoid.
2520          */
2521 #if HAVE_WORKING_LINK
2522         if (link(tmppath, path) < 0)
2523         {
2524                 if (use_lock)
2525                         LWLockRelease(ControlFileLock);
2526                 ereport(LOG,
2527                                 (errcode_for_file_access(),
2528                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
2529                                                 tmppath, path)));
2530                 return false;
2531         }
2532         unlink(tmppath);
2533 #else
2534         if (rename(tmppath, path) < 0)
2535         {
2536                 if (use_lock)
2537                         LWLockRelease(ControlFileLock);
2538                 ereport(LOG,
2539                                 (errcode_for_file_access(),
2540                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
2541                                                 tmppath, path)));
2542                 return false;
2543         }
2544 #endif
2545
2546         if (use_lock)
2547                 LWLockRelease(ControlFileLock);
2548
2549         return true;
2550 }
2551
2552 /*
2553  * Open a pre-existing logfile segment for writing.
2554  */
2555 int
2556 XLogFileOpen(XLogSegNo segno)
2557 {
2558         char            path[MAXPGPATH];
2559         int                     fd;
2560
2561         XLogFilePath(path, ThisTimeLineID, segno);
2562
2563         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2564                                            S_IRUSR | S_IWUSR);
2565         if (fd < 0)
2566                 ereport(PANIC,
2567                                 (errcode_for_file_access(),
2568                                  errmsg("could not open xlog file \"%s\": %m", path)));
2569
2570         return fd;
2571 }
2572
2573 /*
2574  * Open a logfile segment for reading (during recovery).
2575  *
2576  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2577  * Otherwise, it's assumed to be already available in pg_xlog.
2578  */
2579 static int
2580 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
2581                          int source, bool notfoundOk)
2582 {
2583         char            xlogfname[MAXFNAMELEN];
2584         char            activitymsg[MAXFNAMELEN + 16];
2585         char            path[MAXPGPATH];
2586         int                     fd;
2587
2588         XLogFileName(xlogfname, tli, segno);
2589
2590         switch (source)
2591         {
2592                 case XLOG_FROM_ARCHIVE:
2593                         /* Report recovery progress in PS display */
2594                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2595                                          xlogfname);
2596                         set_ps_display(activitymsg, false);
2597
2598                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2599                                                                                                           "RECOVERYXLOG",
2600                                                                                                           XLogSegSize,
2601                                                                                                           InRedo);
2602                         if (!restoredFromArchive)
2603                                 return -1;
2604                         break;
2605
2606                 case XLOG_FROM_PG_XLOG:
2607                 case XLOG_FROM_STREAM:
2608                         XLogFilePath(path, tli, segno);
2609                         restoredFromArchive = false;
2610                         break;
2611
2612                 default:
2613                         elog(ERROR, "invalid XLogFileRead source %d", source);
2614         }
2615
2616         /*
2617          * If the segment was fetched from archival storage, replace the existing
2618          * xlog segment (if any) with the archival version.
2619          */
2620         if (source == XLOG_FROM_ARCHIVE)
2621         {
2622                 char            xlogfpath[MAXPGPATH];
2623                 bool            reload = false;
2624                 struct stat statbuf;
2625
2626                 XLogFilePath(xlogfpath, tli, segno);
2627                 if (stat(xlogfpath, &statbuf) == 0)
2628                 {
2629                         char oldpath[MAXPGPATH];
2630 #ifdef WIN32
2631                         static unsigned int deletedcounter = 1;
2632                         /*
2633                          * On Windows, if another process (e.g a walsender process) holds
2634                          * the file open in FILE_SHARE_DELETE mode, unlink will succeed,
2635                          * but the file will still show up in directory listing until the
2636                          * last handle is closed, and we cannot rename the new file in its
2637                          * place until that. To avoid that problem, rename the old file to
2638                          * a temporary name first. Use a counter to create a unique
2639                          * filename, because the same file might be restored from the
2640                          * archive multiple times, and a walsender could still be holding
2641                          * onto an old deleted version of it.
2642                          */
2643                         snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
2644                                          xlogfpath, deletedcounter++);
2645                         if (rename(xlogfpath, oldpath) != 0)
2646                         {
2647                                 ereport(ERROR,
2648                                                 (errcode_for_file_access(),
2649                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
2650                                                                 xlogfpath, oldpath)));
2651                         }
2652 #else
2653                         strncpy(oldpath, xlogfpath, MAXPGPATH);
2654 #endif
2655                         if (unlink(oldpath) != 0)
2656                                 ereport(FATAL,
2657                                                 (errcode_for_file_access(),
2658                                                  errmsg("could not remove file \"%s\": %m",
2659                                                                 xlogfpath)));
2660                         reload = true;
2661                 }
2662
2663                 if (rename(path, xlogfpath) < 0)
2664                         ereport(ERROR,
2665                                         (errcode_for_file_access(),
2666                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
2667                                                         path, xlogfpath)));
2668
2669                 /*
2670                  * Set path to point at the new file in pg_xlog.
2671                  */
2672                 strncpy(path, xlogfpath, MAXPGPATH);
2673
2674                 /*
2675                  * If the existing segment was replaced, since walsenders might have
2676                  * it open, request them to reload a currently-open segment.
2677                  */
2678                 if (reload)
2679                         WalSndRqstFileReload();
2680
2681                 /* Signal walsender that new WAL has arrived */
2682                 if (AllowCascadeReplication())
2683                         WalSndWakeup();
2684         }
2685
2686         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2687         if (fd >= 0)
2688         {
2689                 /* Success! */
2690                 curFileTLI = tli;
2691
2692                 /* Report recovery progress in PS display */
2693                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2694                                  xlogfname);
2695                 set_ps_display(activitymsg, false);
2696
2697                 /* Track source of data in assorted state variables */
2698                 readSource = source;
2699                 XLogReceiptSource = source;
2700                 /* In FROM_STREAM case, caller tracks receipt time, not me */
2701                 if (source != XLOG_FROM_STREAM)
2702                         XLogReceiptTime = GetCurrentTimestamp();
2703
2704                 /* The file header needs to be validated on first access */
2705                 readFileHeaderValidated = false;
2706
2707                 return fd;
2708         }
2709         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
2710                 ereport(PANIC,
2711                                 (errcode_for_file_access(),
2712                                  errmsg("could not open file \"%s\": %m", path)));
2713         return -1;
2714 }
2715
2716 /*
2717  * Open a logfile segment for reading (during recovery).
2718  *
2719  * This version searches for the segment with any TLI listed in expectedTLEs.
2720  */
2721 static int
2722 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
2723 {
2724         char            path[MAXPGPATH];
2725         ListCell   *cell;
2726         int                     fd;
2727
2728         /*
2729          * Loop looking for a suitable timeline ID: we might need to read any of
2730          * the timelines listed in expectedTLEs.
2731          *
2732          * We expect curFileTLI on entry to be the TLI of the preceding file in
2733          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2734          * to go backwards; this prevents us from picking up the wrong file when a
2735          * parent timeline extends to higher segment numbers than the child we
2736          * want to read.
2737          */
2738         foreach(cell, expectedTLEs)
2739         {
2740                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
2741
2742                 if (tli < curFileTLI)
2743                         break;                          /* don't bother looking at too-old TLIs */
2744
2745                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
2746                 {
2747                         fd = XLogFileRead(segno, emode, tli, XLOG_FROM_ARCHIVE, true);
2748                         if (fd != -1)
2749                         {
2750                                 elog(DEBUG1, "got WAL segment from archive");
2751                                 return fd;
2752                         }
2753                 }
2754
2755                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
2756                 {
2757                         fd = XLogFileRead(segno, emode, tli, XLOG_FROM_PG_XLOG, true);
2758                         if (fd != -1)
2759                                 return fd;
2760                 }
2761         }
2762
2763         /* Couldn't find it.  For simplicity, complain about front timeline */
2764         XLogFilePath(path, recoveryTargetTLI, segno);
2765         errno = ENOENT;
2766         ereport(emode,
2767                         (errcode_for_file_access(),
2768                          errmsg("could not open file \"%s\": %m", path)));
2769         return -1;
2770 }
2771
2772 /*
2773  * Close the current logfile segment for writing.
2774  */
2775 static void
2776 XLogFileClose(void)
2777 {
2778         Assert(openLogFile >= 0);
2779
2780         /*
2781          * WAL segment files will not be re-read in normal operation, so we advise
2782          * the OS to release any cached pages.  But do not do so if WAL archiving
2783          * or streaming is active, because archiver and walsender process could
2784          * use the cache to read the WAL segment.
2785          */
2786 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2787         if (!XLogIsNeeded())
2788                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2789 #endif
2790
2791         if (close(openLogFile))
2792                 ereport(PANIC,
2793                                 (errcode_for_file_access(),
2794                                  errmsg("could not close log file %s: %m",
2795                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
2796         openLogFile = -1;
2797 }
2798
2799 /*
2800  * Preallocate log files beyond the specified log endpoint.
2801  *
2802  * XXX this is currently extremely conservative, since it forces only one
2803  * future log segment to exist, and even that only if we are 75% done with
2804  * the current one.  This is only appropriate for very low-WAL-volume systems.
2805  * High-volume systems will be OK once they've built up a sufficient set of
2806  * recycled log segments, but the startup transient is likely to include
2807  * a lot of segment creations by foreground processes, which is not so good.
2808  */
2809 static void
2810 PreallocXlogFiles(XLogRecPtr endptr)
2811 {
2812         XLogSegNo       _logSegNo;
2813         int                     lf;
2814         bool            use_existent;
2815
2816         XLByteToPrevSeg(endptr, _logSegNo);
2817         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
2818         {
2819                 _logSegNo++;
2820                 use_existent = true;
2821                 lf = XLogFileInit(_logSegNo, &use_existent, true);
2822                 close(lf);
2823                 if (!use_existent)
2824                         CheckpointStats.ckpt_segs_added++;
2825         }
2826 }
2827
2828 /*
2829  * Get the segno of the latest removed or recycled WAL segment.
2830  * Returns 0/0 if no WAL segments have been removed since startup.
2831  */
2832 void
2833 XLogGetLastRemoved(XLogSegNo *segno)
2834 {
2835         /* use volatile pointer to prevent code rearrangement */
2836         volatile XLogCtlData *xlogctl = XLogCtl;
2837
2838         SpinLockAcquire(&xlogctl->info_lck);
2839         *segno = xlogctl->lastRemovedSegNo;
2840         SpinLockRelease(&xlogctl->info_lck);
2841 }
2842
2843 /*
2844  * Update the last removed segno pointer in shared memory, to reflect
2845  * that the given XLOG file has been removed.
2846  */
2847 static void
2848 UpdateLastRemovedPtr(char *filename)
2849 {
2850         /* use volatile pointer to prevent code rearrangement */
2851         volatile XLogCtlData *xlogctl = XLogCtl;
2852         uint32          tli;
2853         XLogSegNo       segno;
2854
2855         XLogFromFileName(filename, &tli, &segno);
2856
2857         SpinLockAcquire(&xlogctl->info_lck);
2858         if (segno > xlogctl->lastRemovedSegNo)
2859                 xlogctl->lastRemovedSegNo = segno;
2860         SpinLockRelease(&xlogctl->info_lck);
2861 }
2862
2863 /*
2864  * Recycle or remove all log files older or equal to passed segno
2865  *
2866  * endptr is current (or recent) end of xlog; this is used to determine
2867  * whether we want to recycle rather than delete no-longer-wanted log files.
2868  */
2869 static void
2870 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
2871 {
2872         XLogSegNo       endlogSegNo;
2873         int                     max_advance;
2874         DIR                *xldir;
2875         struct dirent *xlde;
2876         char            lastoff[MAXFNAMELEN];
2877         char            path[MAXPGPATH];
2878
2879 #ifdef WIN32
2880         char            newpath[MAXPGPATH];
2881 #endif
2882         struct stat statbuf;
2883
2884         /*
2885          * Initialize info about where to try to recycle to.  We allow recycling
2886          * segments up to XLOGfileslop segments beyond the current XLOG location.
2887          */
2888         XLByteToPrevSeg(endptr, endlogSegNo);
2889         max_advance = XLOGfileslop;
2890
2891         xldir = AllocateDir(XLOGDIR);
2892         if (xldir == NULL)
2893                 ereport(ERROR,
2894                                 (errcode_for_file_access(),
2895                                  errmsg("could not open transaction log directory \"%s\": %m",
2896                                                 XLOGDIR)));
2897
2898         XLogFileName(lastoff, ThisTimeLineID, segno);
2899
2900         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
2901                  lastoff);
2902
2903         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2904         {
2905                 /*
2906                  * We ignore the timeline part of the XLOG segment identifiers in
2907                  * deciding whether a segment is still needed.  This ensures that we
2908                  * won't prematurely remove a segment from a parent timeline. We could
2909                  * probably be a little more proactive about removing segments of
2910                  * non-parent timelines, but that would be a whole lot more
2911                  * complicated.
2912                  *
2913                  * We use the alphanumeric sorting property of the filenames to decide
2914                  * which ones are earlier than the lastoff segment.
2915                  */
2916                 if (strlen(xlde->d_name) == 24 &&
2917                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2918                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
2919                 {
2920                         if (RecoveryInProgress() || XLogArchiveCheckDone(xlde->d_name))
2921                         {
2922                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2923
2924                                 /* Update the last removed location in shared memory first */
2925                                 UpdateLastRemovedPtr(xlde->d_name);
2926
2927                                 /*
2928                                  * Before deleting the file, see if it can be recycled as a
2929                                  * future log segment. Only recycle normal files, pg_standby
2930                                  * for example can create symbolic links pointing to a
2931                                  * separate archive directory.
2932                                  */
2933                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
2934                                         InstallXLogFileSegment(&endlogSegNo, path,
2935                                                                                    true, &max_advance, true))
2936                                 {
2937                                         ereport(DEBUG2,
2938                                                         (errmsg("recycled transaction log file \"%s\"",
2939                                                                         xlde->d_name)));
2940                                         CheckpointStats.ckpt_segs_recycled++;
2941                                         /* Needn't recheck that slot on future iterations */
2942                                         if (max_advance > 0)
2943                                         {
2944                                                 endlogSegNo++;
2945                                                 max_advance--;
2946                                         }
2947                                 }
2948                                 else
2949                                 {
2950                                         /* No need for any more future segments... */
2951                                         int                     rc;
2952
2953                                         ereport(DEBUG2,
2954                                                         (errmsg("removing transaction log file \"%s\"",
2955                                                                         xlde->d_name)));
2956
2957 #ifdef WIN32
2958
2959                                         /*
2960                                          * On Windows, if another process (e.g another backend)
2961                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
2962                                          * will succeed, but the file will still show up in
2963                                          * directory listing until the last handle is closed. To
2964                                          * avoid confusing the lingering deleted file for a live
2965                                          * WAL file that needs to be archived, rename it before
2966                                          * deleting it.
2967                                          *
2968                                          * If another process holds the file open without
2969                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
2970                                          * again at the next checkpoint.
2971                                          */
2972                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
2973                                         if (rename(path, newpath) != 0)
2974                                         {
2975                                                 ereport(LOG,
2976                                                                 (errcode_for_file_access(),
2977                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
2978                                                                                 path)));
2979                                                 continue;
2980                                         }
2981                                         rc = unlink(newpath);
2982 #else
2983                                         rc = unlink(path);
2984 #endif
2985                                         if (rc != 0)
2986                                         {
2987                                                 ereport(LOG,
2988                                                                 (errcode_for_file_access(),
2989                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
2990                                                                                 path)));
2991                                                 continue;
2992                                         }
2993                                         CheckpointStats.ckpt_segs_removed++;
2994                                 }
2995
2996                                 XLogArchiveCleanup(xlde->d_name);
2997                         }
2998                 }
2999         }
3000
3001         FreeDir(xldir);
3002 }
3003
3004 /*
3005  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3006  * If the latter does not exist, recreate it.
3007  *
3008  * It is not the goal of this function to verify the contents of these
3009  * directories, but to help in cases where someone has performed a cluster
3010  * copy for PITR purposes but omitted pg_xlog from the copy.
3011  *
3012  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3013  * policy decision was made not to.  It is fairly common for pg_xlog to be
3014  * a symlink, and if that was the DBA's intent then automatically making a
3015  * plain directory would result in degraded performance with no notice.
3016  */
3017 static void
3018 ValidateXLOGDirectoryStructure(void)
3019 {
3020         char            path[MAXPGPATH];
3021         struct stat stat_buf;
3022
3023         /* Check for pg_xlog; if it doesn't exist, error out */
3024         if (stat(XLOGDIR, &stat_buf) != 0 ||
3025                 !S_ISDIR(stat_buf.st_mode))
3026                 ereport(FATAL,
3027                                 (errmsg("required WAL directory \"%s\" does not exist",
3028                                                 XLOGDIR)));
3029
3030         /* Check for archive_status */
3031         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3032         if (stat(path, &stat_buf) == 0)
3033         {
3034                 /* Check for weird cases where it exists but isn't a directory */
3035                 if (!S_ISDIR(stat_buf.st_mode))
3036                         ereport(FATAL,
3037                                         (errmsg("required WAL directory \"%s\" does not exist",
3038                                                         path)));
3039         }
3040         else
3041         {
3042                 ereport(LOG,
3043                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3044                 if (mkdir(path, S_IRWXU) < 0)
3045                         ereport(FATAL,
3046                                         (errmsg("could not create missing directory \"%s\": %m",
3047                                                         path)));
3048         }
3049 }
3050
3051 /*
3052  * Remove previous backup history files.  This also retries creation of
3053  * .ready files for any backup history files for which XLogArchiveNotify
3054  * failed earlier.
3055  */
3056 static void
3057 CleanupBackupHistory(void)
3058 {
3059         DIR                *xldir;
3060         struct dirent *xlde;
3061         char            path[MAXPGPATH];
3062
3063         xldir = AllocateDir(XLOGDIR);
3064         if (xldir == NULL)
3065                 ereport(ERROR,
3066                                 (errcode_for_file_access(),
3067                                  errmsg("could not open transaction log directory \"%s\": %m",
3068                                                 XLOGDIR)));
3069
3070         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3071         {
3072                 if (strlen(xlde->d_name) > 24 &&
3073                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3074                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3075                                    ".backup") == 0)
3076                 {
3077                         if (XLogArchiveCheckDone(xlde->d_name))
3078                         {
3079                                 ereport(DEBUG2,
3080                                 (errmsg("removing transaction log backup history file \"%s\"",
3081                                                 xlde->d_name)));
3082                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3083                                 unlink(path);
3084                                 XLogArchiveCleanup(xlde->d_name);
3085                         }
3086                 }
3087         }
3088
3089         FreeDir(xldir);
3090 }
3091
3092 /*
3093  * Restore a full-page image from a backup block attached to an XLOG record.
3094  *
3095  * lsn: LSN of the XLOG record being replayed
3096  * record: the complete XLOG record
3097  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
3098  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
3099  * keep_buffer: TRUE to return the buffer still locked and pinned
3100  *
3101  * Returns the buffer number containing the page.  Note this is not terribly
3102  * useful unless keep_buffer is specified as TRUE.
3103  *
3104  * Note: when a backup block is available in XLOG, we restore it
3105  * unconditionally, even if the page in the database appears newer.
3106  * This is to protect ourselves against database pages that were partially
3107  * or incorrectly written during a crash.  We assume that the XLOG data
3108  * must be good because it has passed a CRC check, while the database
3109  * page might not be.  This will force us to replay all subsequent
3110  * modifications of the page that appear in XLOG, rather than possibly
3111  * ignoring them as already applied, but that's not a huge drawback.
3112  *
3113  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
3114  * else a normal exclusive lock is used.  During crash recovery, that's just
3115  * pro forma because there can't be any regular backends in the system, but
3116  * in hot standby mode the distinction is important.
3117  *
3118  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
3119  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
3120  * is needed in some cases when replaying XLOG records that touch multiple
3121  * pages, to prevent inconsistent states from being visible to other backends.
3122  * (Again, that's only important in hot standby mode.)
3123  */
3124 Buffer
3125 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
3126                                    bool get_cleanup_lock, bool keep_buffer)
3127 {
3128         Buffer          buffer;
3129         Page            page;
3130         BkpBlock        bkpb;
3131         char       *blk;
3132         int                     i;
3133
3134         /* Locate requested BkpBlock in the record */
3135         blk = (char *) XLogRecGetData(record) + record->xl_len;
3136         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3137         {
3138                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
3139                         continue;
3140
3141                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3142                 blk += sizeof(BkpBlock);
3143
3144                 if (i == block_index)
3145                 {
3146                         /* Found it, apply the update */
3147                         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3148                                                                                         RBM_ZERO);
3149                         Assert(BufferIsValid(buffer));
3150                         if (get_cleanup_lock)
3151                                 LockBufferForCleanup(buffer);
3152                         else
3153                                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3154
3155                         page = (Page) BufferGetPage(buffer);
3156
3157                         if (bkpb.hole_length == 0)
3158                         {
3159                                 memcpy((char *) page, blk, BLCKSZ);
3160                         }
3161                         else
3162                         {
3163                                 memcpy((char *) page, blk, bkpb.hole_offset);
3164                                 /* must zero-fill the hole */
3165                                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
3166                                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3167                                            blk + bkpb.hole_offset,
3168                                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3169                         }
3170
3171                         PageSetLSN(page, lsn);
3172                         PageSetTLI(page, ThisTimeLineID);
3173                         MarkBufferDirty(buffer);
3174
3175                         if (!keep_buffer)
3176                                 UnlockReleaseBuffer(buffer);
3177
3178                         return buffer;
3179                 }
3180
3181                 blk += BLCKSZ - bkpb.hole_length;
3182         }
3183
3184         /* Caller specified a bogus block_index */
3185         elog(ERROR, "failed to restore block_index %d", block_index);
3186         return InvalidBuffer;           /* keep compiler quiet */
3187 }
3188
3189 /*
3190  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
3191  * record (other than to the minimal extent of computing the amount of
3192  * data to read in) until we've checked the CRCs.
3193  *
3194  * We assume all of the record (that is, xl_tot_len bytes) has been read
3195  * into memory at *record.  Also, ValidXLogRecordHeader() has accepted the
3196  * record's header, which means in particular that xl_tot_len is at least
3197  * SizeOfXlogRecord, so it is safe to fetch xl_len.
3198  */
3199 static bool
3200 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
3201 {
3202         pg_crc32        crc;
3203         int                     i;
3204         uint32          len = record->xl_len;
3205         BkpBlock        bkpb;
3206         char       *blk;
3207         size_t          remaining = record->xl_tot_len;
3208
3209         /* First the rmgr data */
3210         if (remaining < SizeOfXLogRecord + len)
3211         {
3212                 /* ValidXLogRecordHeader() should've caught this already... */
3213                 ereport(emode_for_corrupt_record(emode, recptr),
3214                                 (errmsg("invalid record length at %X/%X",
3215                                                 (uint32) (recptr >> 32), (uint32) recptr)));
3216                 return false;
3217         }
3218         remaining -= SizeOfXLogRecord + len;
3219         INIT_CRC32(crc);
3220         COMP_CRC32(crc, XLogRecGetData(record), len);
3221
3222         /* Add in the backup blocks, if any */
3223         blk = (char *) XLogRecGetData(record) + len;
3224         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3225         {
3226                 uint32          blen;
3227
3228                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
3229                         continue;
3230
3231                 if (remaining < sizeof(BkpBlock))
3232                 {
3233                         ereport(emode_for_corrupt_record(emode, recptr),
3234                                         (errmsg("invalid backup block size in record at %X/%X",
3235                                                         (uint32) (recptr >> 32), (uint32) recptr)));
3236                         return false;
3237                 }
3238                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3239
3240                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3241                 {
3242                         ereport(emode_for_corrupt_record(emode, recptr),
3243                                         (errmsg("incorrect hole size in record at %X/%X",
3244                                                         (uint32) (recptr >> 32), (uint32) recptr)));
3245                         return false;
3246                 }
3247                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3248
3249                 if (remaining < blen)
3250                 {
3251                         ereport(emode_for_corrupt_record(emode, recptr),
3252                                         (errmsg("invalid backup block size in record at %X/%X",
3253                                                         (uint32) (recptr >> 32), (uint32) recptr)));
3254                         return false;
3255                 }
3256                 remaining -= blen;
3257                 COMP_CRC32(crc, blk, blen);
3258                 blk += blen;
3259         }
3260
3261         /* Check that xl_tot_len agrees with our calculation */
3262         if (remaining != 0)
3263         {
3264                 ereport(emode_for_corrupt_record(emode, recptr),
3265                                 (errmsg("incorrect total length in record at %X/%X",
3266                                                 (uint32) (recptr >> 32), (uint32) recptr)));
3267                 return false;
3268         }
3269
3270         /* Finally include the record header */
3271         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
3272         FIN_CRC32(crc);
3273
3274         if (!EQ_CRC32(record->xl_crc, crc))
3275         {
3276                 ereport(emode_for_corrupt_record(emode, recptr),
3277                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3278                                 (uint32) (recptr >> 32), (uint32) recptr)));
3279                 return false;
3280         }
3281
3282         return true;
3283 }
3284
3285 /*
3286  * Attempt to read an XLOG record.
3287  *
3288  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3289  * try to read a record just after the last one previously read.
3290  *
3291  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3292  * (emode must be either PANIC, LOG)
3293  *
3294  * The record is copied into readRecordBuf, so that on successful return,
3295  * the returned record pointer always points there.
3296  */
3297 static XLogRecord *
3298 ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
3299 {
3300         XLogRecord *record;
3301         XLogRecPtr      tmpRecPtr = EndRecPtr;
3302         bool            randAccess = false;
3303         uint32          len,
3304                                 total_len;
3305         uint32          targetRecOff;
3306         uint32          pageHeaderSize;
3307         bool            gotheader;
3308
3309         if (readBuf == NULL)
3310         {
3311                 /*
3312                  * First time through, permanently allocate readBuf.  We do it this
3313                  * way, rather than just making a static array, for two reasons: (1)
3314                  * no need to waste the storage in most instantiations of the backend;
3315                  * (2) a static char array isn't guaranteed to have any particular
3316                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3317                  */
3318                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3319                 Assert(readBuf != NULL);
3320         }
3321
3322         if (RecPtr == NULL)
3323         {
3324                 RecPtr = &tmpRecPtr;
3325
3326                 /*
3327                  * RecPtr is pointing to end+1 of the previous WAL record.  If
3328                  * we're at a page boundary, no more records can fit on the current
3329                  * page. We must skip over the page header, but we can't do that
3330                  * until we've read in the page, since the header size is variable.
3331                  */
3332         }
3333         else
3334         {
3335                 /*
3336                  * In this case, the passed-in record pointer should already be
3337                  * pointing to a valid record starting position.
3338                  */
3339                 if (!XRecOffIsValid(*RecPtr))
3340                         ereport(PANIC,
3341                                         (errmsg("invalid record offset at %X/%X",
3342                                                         (uint32) (*RecPtr >> 32), (uint32) *RecPtr)));
3343
3344                 /*
3345                  * Since we are going to a random position in WAL, forget any prior
3346                  * state about what timeline we were in, and allow it to be any
3347                  * timeline in expectedTLEs.  We also set a flag to allow curFileTLI
3348                  * to go backwards (but we can't reset that variable right here, since
3349                  * we might not change files at all).
3350                  */
3351                 /* see comment in ValidXLogPageHeader */
3352                 lastPageTLI = lastSegmentTLI = 0;
3353                 randAccess = true;              /* allow curFileTLI to go backwards too */
3354         }
3355
3356         /* This is the first try to read this page. */
3357         lastSourceFailed = false;
3358 retry:
3359         /* Read the page containing the record */
3360         if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
3361                 return NULL;
3362
3363         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3364         targetRecOff = (*RecPtr) % XLOG_BLCKSZ;
3365         if (targetRecOff == 0)
3366         {
3367                 /*
3368                  * At page start, so skip over page header.  The Assert checks that
3369                  * we're not scribbling on caller's record pointer; it's OK because we
3370                  * can only get here in the continuing-from-prev-record case, since
3371                  * XRecOffIsValid rejected the zero-page-offset case otherwise.
3372                  */
3373                 Assert(RecPtr == &tmpRecPtr);
3374                 (*RecPtr) += pageHeaderSize;
3375                 targetRecOff = pageHeaderSize;
3376         }
3377         else if (targetRecOff < pageHeaderSize)
3378         {
3379                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3380                                 (errmsg("invalid record offset at %X/%X",
3381                                                 (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3382                 goto next_record_is_invalid;
3383         }
3384         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3385                 targetRecOff == pageHeaderSize)
3386         {
3387                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3388                                 (errmsg("contrecord is requested by %X/%X",
3389                                                 (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3390                 goto next_record_is_invalid;
3391         }
3392
3393         /*
3394          * Read the record length.
3395          *
3396          * NB: Even though we use an XLogRecord pointer here, the whole record
3397          * header might not fit on this page. xl_tot_len is the first field of
3398          * the struct, so it must be on this page (the records are MAXALIGNed),
3399          * but we cannot access any other fields until we've verified that we
3400          * got the whole header.
3401          */
3402         record = (XLogRecord *) (readBuf + (*RecPtr) % XLOG_BLCKSZ);
3403         total_len = record->xl_tot_len;
3404
3405         /*
3406          * If the whole record header is on this page, validate it immediately.
3407          * Otherwise do just a basic sanity check on xl_tot_len, and validate the
3408          * rest of the header after reading it from the next page.  The xl_tot_len
3409          * check is necessary here to ensure that we enter the "Need to reassemble
3410          * record" code path below; otherwise we might fail to apply
3411          * ValidXLogRecordHeader at all.
3412          */
3413         if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord)
3414         {
3415                 if (!ValidXLogRecordHeader(RecPtr, record, emode, randAccess))
3416                         goto next_record_is_invalid;
3417                 gotheader = true;
3418         }
3419         else
3420         {
3421                 if (total_len < SizeOfXLogRecord)
3422                 {
3423                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3424                                         (errmsg("invalid record length at %X/%X",
3425                                                         (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3426                         goto next_record_is_invalid;
3427                 }
3428                 gotheader = false;
3429         }
3430
3431         /*
3432          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3433          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3434          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3435          * enough for all "normal" records, but very large commit or abort records
3436          * might need more space.)
3437          */
3438         if (total_len > readRecordBufSize)
3439         {
3440                 uint32          newSize = total_len;
3441
3442                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3443                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3444                 if (readRecordBuf)
3445                         free(readRecordBuf);
3446                 readRecordBuf = (char *) malloc(newSize);
3447                 if (!readRecordBuf)
3448                 {
3449                         readRecordBufSize = 0;
3450                         /* We treat this as a "bogus data" condition */
3451                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3452                                         (errmsg("record length %u at %X/%X too long",
3453                                                         total_len, (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3454                         goto next_record_is_invalid;
3455                 }
3456                 readRecordBufSize = newSize;
3457         }
3458
3459         len = XLOG_BLCKSZ - (*RecPtr) % XLOG_BLCKSZ;
3460         if (total_len > len)
3461         {
3462                 /* Need to reassemble record */
3463                 char       *contrecord;
3464                 XLogPageHeader pageHeader;
3465                 XLogRecPtr      pagelsn;
3466                 char       *buffer;
3467                 uint32          gotlen;
3468
3469                 /* Initialize pagelsn to the beginning of the page this record is on */
3470                 pagelsn = ((*RecPtr) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3471
3472                 /* Copy the first fragment of the record from the first page. */
3473                 memcpy(readRecordBuf, readBuf + (*RecPtr) % XLOG_BLCKSZ, len);
3474                 buffer = readRecordBuf + len;
3475                 gotlen = len;
3476
3477                 do
3478                 {
3479                         /* Calculate pointer to beginning of next page */
3480                         XLByteAdvance(pagelsn, XLOG_BLCKSZ);
3481                         /* Wait for the next page to become available */
3482                         if (!XLogPageRead(&pagelsn, emode, false, false))
3483                                 return NULL;
3484
3485                         /* Check that the continuation on next page looks valid */
3486                         pageHeader = (XLogPageHeader) readBuf;
3487                         if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD))
3488                         {
3489                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3490                                                 (errmsg("there is no contrecord flag in log segment %s, offset %u",
3491                                                                 XLogFileNameP(curFileTLI, readSegNo),
3492                                                                 readOff)));
3493                                 goto next_record_is_invalid;
3494                         }
3495                         /*
3496                          * Cross-check that xlp_rem_len agrees with how much of the record
3497                          * we expect there to be left.
3498                          */
3499                         if (pageHeader->xlp_rem_len == 0 ||
3500                                 total_len != (pageHeader->xlp_rem_len + gotlen))
3501                         {
3502                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3503                                                 (errmsg("invalid contrecord length %u in log segment %s, offset %u",
3504                                                                 pageHeader->xlp_rem_len,
3505                                                                 XLogFileNameP(curFileTLI, readSegNo),
3506                                                                 readOff)));
3507                                 goto next_record_is_invalid;
3508                         }
3509
3510                         /* Append the continuation from this page to the buffer */
3511                         pageHeaderSize = XLogPageHeaderSize(pageHeader);
3512                         contrecord = (char *) readBuf + pageHeaderSize;
3513                         len = XLOG_BLCKSZ - pageHeaderSize;
3514                         if (pageHeader->xlp_rem_len < len)
3515                                 len = pageHeader->xlp_rem_len;
3516                         memcpy(buffer, (char *) contrecord, len);
3517                         buffer += len;
3518                         gotlen += len;
3519
3520                         /* If we just reassembled the record header, validate it. */
3521                         if (!gotheader)
3522                         {
3523                                 record = (XLogRecord *) readRecordBuf;
3524                                 if (!ValidXLogRecordHeader(RecPtr, record, emode, randAccess))
3525                                         goto next_record_is_invalid;
3526                                 gotheader = true;
3527                         }
3528                 } while (pageHeader->xlp_rem_len > len);
3529
3530                 record = (XLogRecord *) readRecordBuf;
3531                 if (!RecordIsValid(record, *RecPtr, emode))
3532                         goto next_record_is_invalid;
3533                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3534                 XLogSegNoOffsetToRecPtr(
3535                         readSegNo,
3536                         readOff + pageHeaderSize + MAXALIGN(pageHeader->xlp_rem_len),
3537                         EndRecPtr);
3538                 ReadRecPtr = *RecPtr;
3539         }
3540         else
3541         {
3542                 /* Record does not cross a page boundary */
3543                 if (!RecordIsValid(record, *RecPtr, emode))
3544                         goto next_record_is_invalid;
3545                 EndRecPtr = *RecPtr + MAXALIGN(total_len);
3546
3547                 ReadRecPtr = *RecPtr;
3548                 memcpy(readRecordBuf, record, total_len);
3549         }
3550
3551         /*
3552          * Special processing if it's an XLOG SWITCH record
3553          */
3554         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3555         {
3556                 /* Pretend it extends to end of segment */
3557                 EndRecPtr += XLogSegSize - 1;
3558                 EndRecPtr -= EndRecPtr % XLogSegSize;
3559
3560                 /*
3561                  * Pretend that readBuf contains the last page of the segment. This is
3562                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3563                  * segment.
3564                  */
3565                 readOff = XLogSegSize - XLOG_BLCKSZ;
3566         }
3567         return record;
3568
3569 next_record_is_invalid:
3570         lastSourceFailed = true;
3571
3572         if (readFile >= 0)
3573         {
3574                 close(readFile);
3575                 readFile = -1;
3576         }
3577
3578         /* In standby-mode, keep trying */
3579         if (StandbyMode)
3580                 goto retry;
3581         else
3582                 return NULL;
3583 }
3584
3585 /*
3586  * Check whether the xlog header of a page just read in looks valid.
3587  *
3588  * This is just a convenience subroutine to avoid duplicated code in
3589  * ReadRecord.  It's not intended for use from anywhere else.
3590  */
3591 static bool
3592 ValidXLogPageHeader(XLogPageHeader hdr, int emode, bool segmentonly)
3593 {
3594         XLogRecPtr      recaddr;
3595
3596         XLogSegNoOffsetToRecPtr(readSegNo, readOff, recaddr);
3597
3598         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3599         {
3600                 ereport(emode_for_corrupt_record(emode, recaddr),
3601                                 (errmsg("invalid magic number %04X in log segment %s, offset %u",
3602                                                 hdr->xlp_magic,
3603                                                 XLogFileNameP(curFileTLI, readSegNo),
3604                                                 readOff)));
3605                 return false;
3606         }
3607         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3608         {
3609                 ereport(emode_for_corrupt_record(emode, recaddr),
3610                                 (errmsg("invalid info bits %04X in log segment %s, offset %u",
3611                                                 hdr->xlp_info,
3612                                                 XLogFileNameP(curFileTLI, readSegNo),
3613                                                 readOff)));
3614                 return false;
3615         }
3616         if (hdr->xlp_info & XLP_LONG_HEADER)
3617         {
3618                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3619
3620                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3621                 {
3622                         char            fhdrident_str[32];
3623                         char            sysident_str[32];
3624
3625                         /*
3626                          * Format sysids separately to keep platform-dependent format code
3627                          * out of the translatable message string.
3628                          */
3629                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
3630                                          longhdr->xlp_sysid);
3631                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
3632                                          ControlFile->system_identifier);
3633                         ereport(emode_for_corrupt_record(emode, recaddr),
3634                                         (errmsg("WAL file is from different database system"),
3635                                          errdetail("WAL file database system identifier is %s, pg_control database system identifier is %s.",
3636                                                            fhdrident_str, sysident_str)));
3637                         return false;
3638                 }
3639                 if (longhdr->xlp_seg_size != XLogSegSize)
3640                 {
3641                         ereport(emode_for_corrupt_record(emode, recaddr),
3642                                         (errmsg("WAL file is from different database system"),
3643                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3644                         return false;
3645                 }
3646                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
3647                 {
3648                         ereport(emode_for_corrupt_record(emode, recaddr),
3649                                         (errmsg("WAL file is from different database system"),
3650                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
3651                         return false;
3652                 }
3653         }
3654         else if (readOff == 0)
3655         {
3656                 /* hmm, first page of file doesn't have a long header? */
3657                 ereport(emode_for_corrupt_record(emode, recaddr),
3658                                 (errmsg("invalid info bits %04X in log segment %s, offset %u",
3659                                                 hdr->xlp_info,
3660                                                 XLogFileNameP(curFileTLI, readSegNo),
3661                                                 readOff)));
3662                 return false;
3663         }
3664
3665         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
3666         {
3667                 ereport(emode_for_corrupt_record(emode, recaddr),
3668                                 (errmsg("unexpected pageaddr %X/%X in log segment %s, offset %u",
3669                                                 (uint32) (hdr->xlp_pageaddr >> 32), (uint32) hdr->xlp_pageaddr,
3670                                                 XLogFileNameP(curFileTLI, readSegNo),
3671                                                 readOff)));
3672                 return false;
3673         }
3674
3675         /*
3676          * Check page TLI is one of the expected values.
3677          */
3678         if (!tliInHistory(hdr->xlp_tli, expectedTLEs))
3679         {
3680                 ereport(emode_for_corrupt_record(emode, recaddr),
3681                                 (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
3682                                                 hdr->xlp_tli,
3683                                                 XLogFileNameP(curFileTLI, readSegNo),
3684                                                 readOff)));
3685                 return false;
3686         }
3687
3688         /*
3689          * Since child timelines are always assigned a TLI greater than their
3690          * immediate parent's TLI, we should never see TLI go backwards across
3691          * successive pages of a consistent WAL sequence.
3692          *
3693          * Of course this check should only be applied when advancing sequentially
3694          * across pages; therefore ReadRecord resets lastPageTLI and
3695          * lastSegmentTLI to zero when going to a random page.
3696          *
3697          * Sometimes we re-open a segment that's already been partially replayed.
3698          * In that case we cannot perform the normal TLI check: if there is a
3699          * timeline switch within the segment, the first page has a smaller TLI
3700          * than later pages following the timeline switch, and we might've read
3701          * them already. As a weaker test, we still check that it's not smaller
3702          * than the TLI we last saw at the beginning of a segment. Pass
3703          * segmentonly = true when re-validating the first page like that, and the
3704          * page you're actually interested in comes later.
3705          */
3706         if (hdr->xlp_tli < (segmentonly ? lastSegmentTLI : lastPageTLI))
3707         {
3708                 ereport(emode_for_corrupt_record(emode, recaddr),
3709                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u",
3710                                                 hdr->xlp_tli,
3711                                                 segmentonly ? lastSegmentTLI : lastPageTLI,
3712                                                 XLogFileNameP(curFileTLI, readSegNo),
3713                                                 readOff)));
3714                 return false;
3715         }
3716         lastPageTLI = hdr->xlp_tli;
3717         if (readOff == 0)
3718                 lastSegmentTLI = hdr->xlp_tli;
3719
3720         return true;
3721 }
3722
3723 /*
3724  * Validate an XLOG record header.
3725  *
3726  * This is just a convenience subroutine to avoid duplicated code in
3727  * ReadRecord.  It's not intended for use from anywhere else.
3728  */
3729 static bool
3730 ValidXLogRecordHeader(XLogRecPtr *RecPtr, XLogRecord *record, int emode,
3731                                           bool randAccess)
3732 {
3733         /*
3734          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3735          * required.
3736          */
3737         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3738         {
3739                 if (record->xl_len != 0)
3740                 {
3741                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3742                                         (errmsg("invalid xlog switch record at %X/%X",
3743                                                         (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3744                         return false;
3745                 }
3746         }
3747         else if (record->xl_len == 0)
3748         {
3749                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3750                                 (errmsg("record with zero length at %X/%X",
3751                                                 (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3752                 return false;
3753         }
3754         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3755                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3756                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3757         {
3758                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3759                                 (errmsg("invalid record length at %X/%X",
3760                                                 (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3761                 return false;
3762         }
3763         if (record->xl_rmid > RM_MAX_ID)
3764         {
3765                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3766                                 (errmsg("invalid resource manager ID %u at %X/%X",
3767                                                 record->xl_rmid, (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3768                 return false;
3769         }
3770         if (randAccess)
3771         {
3772                 /*
3773                  * We can't exactly verify the prev-link, but surely it should be less
3774                  * than the record's own address.
3775                  */
3776                 if (!XLByteLT(record->xl_prev, *RecPtr))
3777                 {
3778                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3779                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3780                                                         (uint32) (record->xl_prev >> 32), (uint32) record->xl_prev,
3781                                                         (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3782                         return false;
3783                 }
3784         }
3785         else
3786         {
3787                 /*
3788                  * Record's prev-link should exactly match our previous location. This
3789                  * check guards against torn WAL pages where a stale but valid-looking
3790                  * WAL record starts on a sector boundary.
3791                  */
3792                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3793                 {
3794                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3795                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3796                                                         (uint32) (record->xl_prev >> 32), (uint32) record->xl_prev,
3797                                                         (uint32) ((*RecPtr) >> 32), (uint32) *RecPtr)));
3798                         return false;
3799                 }
3800         }
3801
3802         return true;
3803 }
3804
3805 /*
3806  * Scan for new timelines that might have appeared in the archive since we
3807  * started recovery.
3808  *
3809  * If there are any, the function changes recovery target TLI to the latest
3810  * one and returns 'true'.
3811  */
3812 static bool
3813 rescanLatestTimeLine(void)
3814 {
3815         List       *newExpectedTLEs;
3816         bool            found;
3817         ListCell   *cell;
3818         TimeLineID      newtarget;
3819         TimeLineHistoryEntry *currentTle = NULL;
3820         /* use volatile pointer to prevent code rearrangement */
3821         volatile XLogCtlData *xlogctl = XLogCtl;
3822
3823         newtarget = findNewestTimeLine(recoveryTargetTLI);
3824         if (newtarget == recoveryTargetTLI)
3825         {
3826                 /* No new timelines found */
3827                 return false;
3828         }
3829
3830         /*
3831          * Determine the list of expected TLIs for the new TLI
3832          */
3833
3834         newExpectedTLEs = readTimeLineHistory(newtarget);
3835
3836         /*
3837          * If the current timeline is not part of the history of the new
3838          * timeline, we cannot proceed to it.
3839          */
3840         found = false;
3841         foreach (cell, newExpectedTLEs)
3842         {
3843                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
3844
3845                 if (currentTle->tli == recoveryTargetTLI)
3846                 {
3847                         found = true;
3848                         break;
3849                 }
3850         }
3851         if (!found)
3852         {
3853                 ereport(LOG,
3854                                 (errmsg("new timeline %u is not a child of database system timeline %u",
3855                                                 newtarget,
3856                                                 ThisTimeLineID)));
3857                 return false;
3858         }
3859
3860         /*
3861          * The current timeline was found in the history file, but check that the
3862          * next timeline was forked off from it *after* the current recovery
3863          * location.
3864          */
3865         if (XLByteLT(currentTle->end, EndRecPtr))
3866         {
3867                 ereport(LOG,
3868                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
3869                                                 newtarget,
3870                                                 ThisTimeLineID,
3871                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
3872                 return false;
3873         }
3874
3875         /* The new timeline history seems valid. Switch target */
3876         recoveryTargetTLI = newtarget;
3877         list_free_deep(expectedTLEs);
3878         expectedTLEs = newExpectedTLEs;
3879
3880         SpinLockAcquire(&xlogctl->info_lck);
3881         xlogctl->RecoveryTargetTLI = recoveryTargetTLI;
3882         SpinLockRelease(&xlogctl->info_lck);
3883
3884         ereport(LOG,
3885                         (errmsg("new target timeline is %u",
3886                                         recoveryTargetTLI)));
3887
3888         /*
3889          * Wake up any walsenders to notice that we have a new target timeline.
3890          */
3891         if (AllowCascadeReplication())
3892                 WalSndWakeup();
3893
3894         return true;
3895 }
3896
3897 /*
3898  * I/O routines for pg_control
3899  *
3900  * *ControlFile is a buffer in shared memory that holds an image of the
3901  * contents of pg_control.      WriteControlFile() initializes pg_control
3902  * given a preloaded buffer, ReadControlFile() loads the buffer from
3903  * the pg_control file (during postmaster or standalone-backend startup),
3904  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3905  *
3906  * For simplicity, WriteControlFile() initializes the fields of pg_control
3907  * that are related to checking backend/database compatibility, and
3908  * ReadControlFile() verifies they are correct.  We could split out the
3909  * I/O and compatibility-check functions, but there seems no need currently.
3910  */
3911 static void
3912 WriteControlFile(void)
3913 {
3914         int                     fd;
3915         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3916
3917         /*
3918          * Initialize version and compatibility-check fields
3919          */
3920         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3921         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3922
3923         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3924         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3925
3926         ControlFile->blcksz = BLCKSZ;
3927         ControlFile->relseg_size = RELSEG_SIZE;
3928         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3929         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3930
3931         ControlFile->nameDataLen = NAMEDATALEN;
3932         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3933
3934         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3935
3936 #ifdef HAVE_INT64_TIMESTAMP
3937         ControlFile->enableIntTimes = true;
3938 #else
3939         ControlFile->enableIntTimes = false;
3940 #endif
3941         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3942         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3943
3944         /* Contents are protected with a CRC */
3945         INIT_CRC32(ControlFile->crc);
3946         COMP_CRC32(ControlFile->crc,
3947                            (char *) ControlFile,
3948                            offsetof(ControlFileData, crc));
3949         FIN_CRC32(ControlFile->crc);
3950
3951         /*
3952          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3953          * excess over sizeof(ControlFileData).  This reduces the odds of
3954          * premature-EOF errors when reading pg_control.  We'll still fail when we
3955          * check the contents of the file, but hopefully with a more specific
3956          * error than "couldn't read pg_control".
3957          */
3958         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3959                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3960
3961         memset(buffer, 0, PG_CONTROL_SIZE);
3962         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3963
3964         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3965                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3966                                            S_IRUSR | S_IWUSR);
3967         if (fd < 0)
3968                 ereport(PANIC,
3969                                 (errcode_for_file_access(),
3970                                  errmsg("could not create control file \"%s\": %m",
3971                                                 XLOG_CONTROL_FILE)));
3972
3973         errno = 0;
3974         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3975         {
3976                 /* if write didn't set errno, assume problem is no disk space */
3977                 if (errno == 0)
3978                         errno = ENOSPC;
3979                 ereport(PANIC,
3980                                 (errcode_for_file_access(),
3981                                  errmsg("could not write to control file: %m")));
3982         }
3983
3984         if (pg_fsync(fd) != 0)
3985                 ereport(PANIC,
3986                                 (errcode_for_file_access(),
3987                                  errmsg("could not fsync control file: %m")));
3988
3989         if (close(fd))
3990                 ereport(PANIC,
3991                                 (errcode_for_file_access(),
3992                                  errmsg("could not close control file: %m")));
3993 }
3994
3995 static void
3996 ReadControlFile(void)
3997 {
3998         pg_crc32        crc;
3999         int                     fd;
4000
4001         /*
4002          * Read data...
4003          */
4004         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4005                                            O_RDWR | PG_BINARY,
4006                                            S_IRUSR | S_IWUSR);
4007         if (fd < 0)
4008                 ereport(PANIC,
4009                                 (errcode_for_file_access(),
4010                                  errmsg("could not open control file \"%s\": %m",
4011                                                 XLOG_CONTROL_FILE)));
4012
4013         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4014                 ereport(PANIC,
4015                                 (errcode_for_file_access(),
4016                                  errmsg("could not read from control file: %m")));
4017
4018         close(fd);
4019
4020         /*
4021          * Check for expected pg_control format version.  If this is wrong, the
4022          * CRC check will likely fail because we'll be checking the wrong number
4023          * of bytes.  Complaining about wrong version will probably be more
4024          * enlightening than complaining about wrong CRC.
4025          */
4026
4027         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4028                 ereport(FATAL,
4029                                 (errmsg("database files are incompatible with server"),
4030                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4031                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4032                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4033                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4034                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4035
4036         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4037                 ereport(FATAL,
4038                                 (errmsg("database files are incompatible with server"),
4039                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4040                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4041                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4042                                  errhint("It looks like you need to initdb.")));
4043
4044         /* Now check the CRC. */
4045         INIT_CRC32(crc);
4046         COMP_CRC32(crc,
4047                            (char *) ControlFile,
4048                            offsetof(ControlFileData, crc));
4049         FIN_CRC32(crc);
4050
4051         if (!EQ_CRC32(crc, ControlFile->crc))
4052                 ereport(FATAL,
4053                                 (errmsg("incorrect checksum in control file")));
4054
4055         /*
4056          * Do compatibility checking immediately.  If the database isn't
4057          * compatible with the backend executable, we want to abort before we can
4058          * possibly do any damage.
4059          */
4060         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4061                 ereport(FATAL,
4062                                 (errmsg("database files are incompatible with server"),
4063                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4064                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4065                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4066                                  errhint("It looks like you need to initdb.")));
4067         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4068                 ereport(FATAL,
4069                                 (errmsg("database files are incompatible with server"),
4070                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4071                                          " but the server was compiled with MAXALIGN %d.",
4072                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4073                                  errhint("It looks like you need to initdb.")));
4074         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4075                 ereport(FATAL,
4076                                 (errmsg("database files are incompatible with server"),
4077                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4078                                  errhint("It looks like you need to initdb.")));
4079         if (ControlFile->blcksz != BLCKSZ)
4080                 ereport(FATAL,
4081                                 (errmsg("database files are incompatible with server"),
4082                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4083                                            " but the server was compiled with BLCKSZ %d.",
4084                                            ControlFile->blcksz, BLCKSZ),
4085                                  errhint("It looks like you need to recompile or initdb.")));
4086         if (ControlFile->relseg_size != RELSEG_SIZE)
4087                 ereport(FATAL,
4088                                 (errmsg("database files are incompatible with server"),
4089                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4090                                   " but the server was compiled with RELSEG_SIZE %d.",
4091                                   ControlFile->relseg_size, RELSEG_SIZE),
4092                                  errhint("It looks like you need to recompile or initdb.")));
4093         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4094                 ereport(FATAL,
4095                                 (errmsg("database files are incompatible with server"),
4096                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4097                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4098                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4099                                  errhint("It looks like you need to recompile or initdb.")));
4100         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4101                 ereport(FATAL,
4102                                 (errmsg("database files are incompatible with server"),
4103                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4104                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4105                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4106                                  errhint("It looks like you need to recompile or initdb.")));
4107         if (ControlFile->nameDataLen != NAMEDATALEN)
4108                 ereport(FATAL,
4109                                 (errmsg("database files are incompatible with server"),
4110                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4111                                   " but the server was compiled with NAMEDATALEN %d.",
4112                                   ControlFile->nameDataLen, NAMEDATALEN),
4113                                  errhint("It looks like you need to recompile or initdb.")));
4114         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4115                 ereport(FATAL,
4116                                 (errmsg("database files are incompatible with server"),
4117                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4118                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4119                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4120                                  errhint("It looks like you need to recompile or initdb.")));
4121         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4122                 ereport(FATAL,
4123                                 (errmsg("database files are incompatible with server"),
4124                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4125                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4126                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4127                                  errhint("It looks like you need to recompile or initdb.")));
4128
4129 #ifdef HAVE_INT64_TIMESTAMP
4130         if (ControlFile->enableIntTimes != true)
4131                 ereport(FATAL,
4132                                 (errmsg("database files are incompatible with server"),
4133                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4134                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4135                                  errhint("It looks like you need to recompile or initdb.")));
4136 #else
4137         if (ControlFile->enableIntTimes != false)
4138                 ereport(FATAL,
4139                                 (errmsg("database files are incompatible with server"),
4140                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4141                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4142                                  errhint("It looks like you need to recompile or initdb.")));
4143 #endif
4144
4145 #ifdef USE_FLOAT4_BYVAL
4146         if (ControlFile->float4ByVal != true)
4147                 ereport(FATAL,
4148                                 (errmsg("database files are incompatible with server"),
4149                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4150                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4151                                  errhint("It looks like you need to recompile or initdb.")));
4152 #else
4153         if (ControlFile->float4ByVal != false)
4154                 ereport(FATAL,
4155                                 (errmsg("database files are incompatible with server"),
4156                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4157                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4158                                  errhint("It looks like you need to recompile or initdb.")));
4159 #endif
4160
4161 #ifdef USE_FLOAT8_BYVAL
4162         if (ControlFile->float8ByVal != true)
4163                 ereport(FATAL,
4164                                 (errmsg("database files are incompatible with server"),
4165                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4166                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4167                                  errhint("It looks like you need to recompile or initdb.")));
4168 #else
4169         if (ControlFile->float8ByVal != false)
4170                 ereport(FATAL,
4171                                 (errmsg("database files are incompatible with server"),
4172                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4173                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4174                                  errhint("It looks like you need to recompile or initdb.")));
4175 #endif
4176 }
4177
4178 void
4179 UpdateControlFile(void)
4180 {
4181         int                     fd;
4182
4183         INIT_CRC32(ControlFile->crc);
4184         COMP_CRC32(ControlFile->crc,
4185                            (char *) ControlFile,
4186                            offsetof(ControlFileData, crc));
4187         FIN_CRC32(ControlFile->crc);
4188
4189         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4190                                            O_RDWR | PG_BINARY,
4191                                            S_IRUSR | S_IWUSR);
4192         if (fd < 0)
4193                 ereport(PANIC,
4194                                 (errcode_for_file_access(),
4195                                  errmsg("could not open control file \"%s\": %m",
4196                                                 XLOG_CONTROL_FILE)));
4197
4198         errno = 0;
4199         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4200         {
4201                 /* if write didn't set errno, assume problem is no disk space */
4202                 if (errno == 0)
4203                         errno = ENOSPC;
4204                 ereport(PANIC,
4205                                 (errcode_for_file_access(),
4206                                  errmsg("could not write to control file: %m")));
4207         }
4208
4209         if (pg_fsync(fd) != 0)
4210                 ereport(PANIC,
4211                                 (errcode_for_file_access(),
4212                                  errmsg("could not fsync control file: %m")));
4213
4214         if (close(fd))
4215                 ereport(PANIC,
4216                                 (errcode_for_file_access(),
4217                                  errmsg("could not close control file: %m")));
4218 }
4219
4220 /*
4221  * Returns the unique system identifier from control file.
4222  */
4223 uint64
4224 GetSystemIdentifier(void)
4225 {
4226         Assert(ControlFile != NULL);
4227         return ControlFile->system_identifier;
4228 }
4229
4230 /*
4231  * Auto-tune the number of XLOG buffers.
4232  *
4233  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4234  * a maximum of one XLOG segment (there is little reason to think that more
4235  * is helpful, at least so long as we force an fsync when switching log files)
4236  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4237  * 9.1, when auto-tuning was added).
4238  *
4239  * This should not be called until NBuffers has received its final value.
4240  */
4241 static int
4242 XLOGChooseNumBuffers(void)
4243 {
4244         int                     xbuffers;
4245
4246         xbuffers = NBuffers / 32;
4247         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4248                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4249         if (xbuffers < 8)
4250                 xbuffers = 8;
4251         return xbuffers;
4252 }
4253
4254 /*
4255  * GUC check_hook for wal_buffers
4256  */
4257 bool
4258 check_wal_buffers(int *newval, void **extra, GucSource source)
4259 {
4260         /*
4261          * -1 indicates a request for auto-tune.
4262          */
4263         if (*newval == -1)
4264         {
4265                 /*
4266                  * If we haven't yet changed the boot_val default of -1, just let it
4267                  * be.  We'll fix it when XLOGShmemSize is called.
4268                  */
4269                 if (XLOGbuffers == -1)
4270                         return true;
4271
4272                 /* Otherwise, substitute the auto-tune value */
4273                 *newval = XLOGChooseNumBuffers();
4274         }
4275
4276         /*
4277          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4278          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4279          * the case, we just silently treat such values as a request for the
4280          * minimum.  (We could throw an error instead, but that doesn't seem very
4281          * helpful.)
4282          */
4283         if (*newval < 4)
4284                 *newval = 4;
4285
4286         return true;
4287 }
4288
4289 /*
4290  * Initialization of shared memory for XLOG
4291  */
4292 Size
4293 XLOGShmemSize(void)
4294 {
4295         Size            size;
4296
4297         /*
4298          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4299          * This isn't an amazingly clean place to do this, but we must wait till
4300          * NBuffers has received its final value, and must do it before using the
4301          * value of XLOGbuffers to do anything important.
4302          */
4303         if (XLOGbuffers == -1)
4304         {
4305                 char            buf[32];
4306
4307                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4308                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4309         }
4310         Assert(XLOGbuffers > 0);
4311
4312         /* XLogCtl */
4313         size = sizeof(XLogCtlData);
4314         /* xlblocks array */
4315         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4316         /* extra alignment padding for XLOG I/O buffers */
4317         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4318         /* and the buffers themselves */
4319         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4320
4321         /*
4322          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4323          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4324          * routine again below to compute the actual allocation size.
4325          */
4326
4327         return size;
4328 }
4329
4330 void
4331 XLOGShmemInit(void)
4332 {
4333         bool            foundCFile,
4334                                 foundXLog;
4335         char       *allocptr;
4336
4337         ControlFile = (ControlFileData *)
4338                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4339         XLogCtl = (XLogCtlData *)
4340                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4341
4342         if (foundCFile || foundXLog)
4343         {
4344                 /* both should be present or neither */
4345                 Assert(foundCFile && foundXLog);
4346                 return;
4347         }
4348
4349         memset(XLogCtl, 0, sizeof(XLogCtlData));
4350
4351         /*
4352          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4353          * multiple of the alignment for same, so no extra alignment padding is
4354          * needed here.
4355          */
4356         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4357         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4358         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4359         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4360
4361         /*
4362          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4363          */
4364         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4365         XLogCtl->pages = allocptr;
4366         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4367
4368         /*
4369          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4370          * in additional info.)
4371          */
4372         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4373         XLogCtl->SharedRecoveryInProgress = true;
4374         XLogCtl->SharedHotStandbyActive = false;
4375         XLogCtl->WalWriterSleeping = false;
4376         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4377         SpinLockInit(&XLogCtl->info_lck);
4378         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
4379
4380         /*
4381          * If we are not in bootstrap mode, pg_control should already exist. Read
4382          * and validate it immediately (see comments in ReadControlFile() for the
4383          * reasons why).
4384          */
4385         if (!IsBootstrapProcessingMode())
4386                 ReadControlFile();
4387 }
4388
4389 /*
4390  * This func must be called ONCE on system install.  It creates pg_control
4391  * and the initial XLOG segment.
4392  */
4393 void
4394 BootStrapXLOG(void)
4395 {
4396         CheckPoint      checkPoint;
4397         char       *buffer;
4398         XLogPageHeader page;
4399         XLogLongPageHeader longpage;
4400         XLogRecord *record;
4401         bool            use_existent;
4402         uint64          sysidentifier;
4403         struct timeval tv;
4404         pg_crc32        crc;
4405
4406         /*
4407          * Select a hopefully-unique system identifier code for this installation.
4408          * We use the result of gettimeofday(), including the fractional seconds
4409          * field, as being about as unique as we can easily get.  (Think not to
4410          * use random(), since it hasn't been seeded and there's no portable way
4411          * to seed it other than the system clock value...)  The upper half of the
4412          * uint64 value is just the tv_sec part, while the lower half is the XOR
4413          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4414          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4415          * knowing this encoding can determine the initialization time of the
4416          * installation, which could perhaps be useful sometimes.
4417          */
4418         gettimeofday(&tv, NULL);
4419         sysidentifier = ((uint64) tv.tv_sec) << 32;
4420         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4421
4422         /* First timeline ID is always 1 */
4423         ThisTimeLineID = 1;
4424
4425         /* page buffer must be aligned suitably for O_DIRECT */
4426         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4427         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4428         memset(page, 0, XLOG_BLCKSZ);
4429
4430         /*
4431          * Set up information for the initial checkpoint record
4432          *
4433          * The initial checkpoint record is written to the beginning of the WAL
4434          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4435          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4436          */
4437         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4438         checkPoint.ThisTimeLineID = ThisTimeLineID;
4439         checkPoint.fullPageWrites = fullPageWrites;
4440         checkPoint.nextXidEpoch = 0;
4441         checkPoint.nextXid = FirstNormalTransactionId;
4442         checkPoint.nextOid = FirstBootstrapObjectId;
4443         checkPoint.nextMulti = FirstMultiXactId;
4444         checkPoint.nextMultiOffset = 0;
4445         checkPoint.oldestXid = FirstNormalTransactionId;
4446         checkPoint.oldestXidDB = TemplateDbOid;
4447         checkPoint.time = (pg_time_t) time(NULL);
4448         checkPoint.oldestActiveXid = InvalidTransactionId;
4449
4450         ShmemVariableCache->nextXid = checkPoint.nextXid;
4451         ShmemVariableCache->nextOid = checkPoint.nextOid;
4452         ShmemVariableCache->oidCount = 0;
4453         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4454         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4455
4456         /* Set up the XLOG page header */
4457         page->xlp_magic = XLOG_PAGE_MAGIC;
4458         page->xlp_info = XLP_LONG_HEADER;
4459         page->xlp_tli = ThisTimeLineID;
4460         page->xlp_pageaddr = XLogSegSize;
4461         longpage = (XLogLongPageHeader) page;
4462         longpage->xlp_sysid = sysidentifier;
4463         longpage->xlp_seg_size = XLogSegSize;
4464         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4465
4466         /* Insert the initial checkpoint record */
4467         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4468         record->xl_prev = 0;
4469         record->xl_xid = InvalidTransactionId;
4470         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4471         record->xl_len = sizeof(checkPoint);
4472         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4473         record->xl_rmid = RM_XLOG_ID;
4474         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4475
4476         INIT_CRC32(crc);
4477         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4478         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
4479         FIN_CRC32(crc);
4480         record->xl_crc = crc;
4481
4482         /* Create first XLOG segment file */
4483         use_existent = false;
4484         openLogFile = XLogFileInit(1, &use_existent, false);
4485
4486         /* Write the first page with the initial record */
4487         errno = 0;
4488         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4489         {
4490                 /* if write didn't set errno, assume problem is no disk space */
4491                 if (errno == 0)
4492                         errno = ENOSPC;
4493                 ereport(PANIC,
4494                                 (errcode_for_file_access(),
4495                           errmsg("could not write bootstrap transaction log file: %m")));
4496         }
4497
4498         if (pg_fsync(openLogFile) != 0)
4499                 ereport(PANIC,
4500                                 (errcode_for_file_access(),
4501                           errmsg("could not fsync bootstrap transaction log file: %m")));
4502
4503         if (close(openLogFile))
4504                 ereport(PANIC,
4505                                 (errcode_for_file_access(),
4506                           errmsg("could not close bootstrap transaction log file: %m")));
4507
4508         openLogFile = -1;
4509
4510         /* Now create pg_control */
4511
4512         memset(ControlFile, 0, sizeof(ControlFileData));
4513         /* Initialize pg_control status fields */
4514         ControlFile->system_identifier = sysidentifier;
4515         ControlFile->state = DB_SHUTDOWNED;
4516         ControlFile->time = checkPoint.time;
4517         ControlFile->checkPoint = checkPoint.redo;
4518         ControlFile->checkPointCopy = checkPoint;
4519
4520         /* Set important parameter values for use when replaying WAL */
4521         ControlFile->MaxConnections = MaxConnections;
4522         ControlFile->max_prepared_xacts = max_prepared_xacts;
4523         ControlFile->max_locks_per_xact = max_locks_per_xact;
4524         ControlFile->wal_level = wal_level;
4525
4526         /* some additional ControlFile fields are set in WriteControlFile() */
4527
4528         WriteControlFile();
4529
4530         /* Bootstrap the commit log, too */
4531         BootStrapCLOG();
4532         BootStrapSUBTRANS();
4533         BootStrapMultiXact();
4534
4535         pfree(buffer);
4536 }
4537
4538 static char *
4539 str_time(pg_time_t tnow)
4540 {
4541         static char buf[128];
4542
4543         pg_strftime(buf, sizeof(buf),
4544                                 "%Y-%m-%d %H:%M:%S %Z",
4545                                 pg_localtime(&tnow, log_timezone));
4546
4547         return buf;
4548 }
4549
4550 /*
4551  * See if there is a recovery command file (recovery.conf), and if so
4552  * read in parameters for archive recovery and XLOG streaming.
4553  *
4554  * The file is parsed using the main configuration parser.
4555  */
4556 static void
4557 readRecoveryCommandFile(void)
4558 {
4559         FILE       *fd;
4560         TimeLineID      rtli = 0;
4561         bool            rtliGiven = false;
4562         ConfigVariable *item,
4563                            *head = NULL,
4564                            *tail = NULL;
4565
4566         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4567         if (fd == NULL)
4568         {
4569                 if (errno == ENOENT)
4570                         return;                         /* not there, so no archive recovery */
4571                 ereport(FATAL,
4572                                 (errcode_for_file_access(),
4573                                  errmsg("could not open recovery command file \"%s\": %m",
4574                                                 RECOVERY_COMMAND_FILE)));
4575         }
4576
4577         /*
4578          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
4579          * no need to check the return value.
4580          */
4581         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
4582
4583         FreeFile(fd);
4584
4585         for (item = head; item; item = item->next)
4586         {
4587                 if (strcmp(item->name, "restore_command") == 0)
4588                 {
4589                         recoveryRestoreCommand = pstrdup(item->value);
4590                         ereport(DEBUG2,
4591                                         (errmsg_internal("restore_command = '%s'",
4592                                                                          recoveryRestoreCommand)));
4593                 }
4594                 else if (strcmp(item->name, "recovery_end_command") == 0)
4595                 {
4596                         recoveryEndCommand = pstrdup(item->value);
4597                         ereport(DEBUG2,
4598                                         (errmsg_internal("recovery_end_command = '%s'",
4599                                                                          recoveryEndCommand)));
4600                 }
4601                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
4602                 {
4603                         archiveCleanupCommand = pstrdup(item->value);
4604                         ereport(DEBUG2,
4605                                         (errmsg_internal("archive_cleanup_command = '%s'",
4606                                                                          archiveCleanupCommand)));
4607                 }
4608                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
4609                 {
4610                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
4611                                 ereport(ERROR,
4612                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4613                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
4614                         ereport(DEBUG2,
4615                                         (errmsg_internal("pause_at_recovery_target = '%s'",
4616                                                                          item->value)));
4617                 }
4618                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
4619                 {
4620                         rtliGiven = true;
4621                         if (strcmp(item->value, "latest") == 0)
4622                                 rtli = 0;
4623                         else
4624                         {
4625                                 errno = 0;
4626                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
4627                                 if (errno == EINVAL || errno == ERANGE)
4628                                         ereport(FATAL,
4629                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4630                                                                         item->value)));
4631                         }
4632                         if (rtli)
4633                                 ereport(DEBUG2,
4634                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
4635                         else
4636                                 ereport(DEBUG2,
4637                                          (errmsg_internal("recovery_target_timeline = latest")));
4638                 }
4639                 else if (strcmp(item->name, "recovery_target_xid") == 0)
4640                 {
4641                         errno = 0;
4642                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
4643                         if (errno == EINVAL || errno == ERANGE)
4644                                 ereport(FATAL,
4645                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4646                                                  item->value)));
4647                         ereport(DEBUG2,
4648                                         (errmsg_internal("recovery_target_xid = %u",
4649                                                                          recoveryTargetXid)));
4650                         recoveryTarget = RECOVERY_TARGET_XID;
4651                 }
4652                 else if (strcmp(item->name, "recovery_target_time") == 0)
4653                 {
4654                         /*
4655                          * if recovery_target_xid or recovery_target_name specified, then
4656                          * this overrides recovery_target_time
4657                          */
4658                         if (recoveryTarget == RECOVERY_TARGET_XID ||
4659                                 recoveryTarget == RECOVERY_TARGET_NAME)
4660                                 continue;
4661                         recoveryTarget = RECOVERY_TARGET_TIME;
4662
4663                         /*
4664                          * Convert the time string given by the user to TimestampTz form.
4665                          */
4666                         recoveryTargetTime =
4667                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4668                                                                                                 CStringGetDatum(item->value),
4669                                                                                                 ObjectIdGetDatum(InvalidOid),
4670                                                                                                                 Int32GetDatum(-1)));
4671                         ereport(DEBUG2,
4672                                         (errmsg_internal("recovery_target_time = '%s'",
4673                                                                    timestamptz_to_str(recoveryTargetTime))));
4674                 }
4675                 else if (strcmp(item->name, "recovery_target_name") == 0)
4676                 {
4677                         /*
4678                          * if recovery_target_xid specified, then this overrides
4679                          * recovery_target_name
4680                          */
4681                         if (recoveryTarget == RECOVERY_TARGET_XID)
4682                                 continue;
4683                         recoveryTarget = RECOVERY_TARGET_NAME;
4684
4685                         recoveryTargetName = pstrdup(item->value);
4686                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
4687                                 ereport(FATAL,
4688                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4689                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
4690                                                                 MAXFNAMELEN - 1)));
4691
4692                         ereport(DEBUG2,
4693                                         (errmsg_internal("recovery_target_name = '%s'",
4694                                                                          recoveryTargetName)));
4695                 }
4696                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
4697                 {
4698                         /*
4699                          * does nothing if a recovery_target is not also set
4700                          */
4701                         if (!parse_bool(item->value, &recoveryTargetInclusive))
4702                                 ereport(ERROR,
4703                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4704                                                  errmsg("parameter \"%s\" requires a Boolean value",
4705                                                                 "recovery_target_inclusive")));
4706                         ereport(DEBUG2,
4707                                         (errmsg_internal("recovery_target_inclusive = %s",
4708                                                                          item->value)));
4709                 }
4710                 else if (strcmp(item->name, "standby_mode") == 0)
4711                 {
4712                         if (!parse_bool(item->value, &StandbyMode))
4713                                 ereport(ERROR,
4714                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4715                                                  errmsg("parameter \"%s\" requires a Boolean value",
4716                                                                 "standby_mode")));
4717                         ereport(DEBUG2,
4718                                         (errmsg_internal("standby_mode = '%s'", item->value)));
4719                 }
4720                 else if (strcmp(item->name, "primary_conninfo") == 0)
4721                 {
4722                         PrimaryConnInfo = pstrdup(item->value);
4723                         ereport(DEBUG2,
4724                                         (errmsg_internal("primary_conninfo = '%s'",
4725                                                                          PrimaryConnInfo)));
4726                 }
4727                 else if (strcmp(item->name, "trigger_file") == 0)
4728                 {
4729                         TriggerFile = pstrdup(item->value);
4730                         ereport(DEBUG2,
4731                                         (errmsg_internal("trigger_file = '%s'",
4732                                                                          TriggerFile)));
4733                 }
4734                 else
4735                         ereport(FATAL,
4736                                         (errmsg("unrecognized recovery parameter \"%s\"",
4737                                                         item->name)));
4738         }
4739
4740         /*
4741          * Check for compulsory parameters
4742          */
4743         if (StandbyMode)
4744         {
4745                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
4746                         ereport(WARNING,
4747                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
4748                                                         RECOVERY_COMMAND_FILE),
4749                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
4750         }
4751         else
4752         {
4753                 if (recoveryRestoreCommand == NULL)
4754                         ereport(FATAL,
4755                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
4756                                                         RECOVERY_COMMAND_FILE)));
4757         }
4758
4759         /* Enable fetching from archive recovery area */
4760         InArchiveRecovery = true;
4761
4762         /*
4763          * If user specified recovery_target_timeline, validate it or compute the
4764          * "latest" value.      We can't do this until after we've gotten the restore
4765          * command and set InArchiveRecovery, because we need to fetch timeline
4766          * history files from the archive.
4767          */
4768         if (rtliGiven)
4769         {
4770                 if (rtli)
4771                 {
4772                         /* Timeline 1 does not have a history file, all else should */
4773                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4774                                 ereport(FATAL,
4775                                                 (errmsg("recovery target timeline %u does not exist",
4776                                                                 rtli)));
4777                         recoveryTargetTLI = rtli;
4778                         recoveryTargetIsLatest = false;
4779                 }
4780                 else
4781                 {
4782                         /* We start the "latest" search from pg_control's timeline */
4783                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4784                         recoveryTargetIsLatest = true;
4785                 }
4786         }
4787
4788         FreeConfigVariables(head);
4789 }
4790
4791 /*
4792  * Exit archive-recovery state
4793  */
4794 static void
4795 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
4796 {
4797         char            recoveryPath[MAXPGPATH];
4798         char            xlogpath[MAXPGPATH];
4799
4800         /*
4801          * We are no longer in archive recovery state.
4802          */
4803         InArchiveRecovery = false;
4804
4805         /*
4806          * Update min recovery point one last time.
4807          */
4808         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4809
4810         /*
4811          * If the ending log segment is still open, close it (to avoid problems on
4812          * Windows with trying to rename or delete an open file).
4813          */
4814         if (readFile >= 0)
4815         {
4816                 close(readFile);
4817                 readFile = -1;
4818         }
4819
4820         /*
4821          * If we are establishing a new timeline, we have to copy data from the
4822          * last WAL segment of the old timeline to create a starting WAL segment
4823          * for the new timeline.
4824          *
4825          * Notify the archiver that the last WAL segment of the old timeline is
4826          * ready to copy to archival storage. Otherwise, it is not archived for a
4827          * while.
4828          */
4829         if (endTLI != ThisTimeLineID)
4830         {
4831                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
4832
4833                 if (XLogArchivingActive())
4834                 {
4835                         XLogFileName(xlogpath, endTLI, endLogSegNo);
4836                         XLogArchiveNotify(xlogpath);
4837                 }
4838         }
4839
4840         /*
4841          * Let's just make real sure there are not .ready or .done flags posted
4842          * for the new segment.
4843          */
4844         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
4845         XLogArchiveCleanup(xlogpath);
4846
4847         /*
4848          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
4849          * of it.
4850          */
4851         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4852         unlink(recoveryPath);           /* ignore any error */
4853
4854         /* Get rid of any remaining recovered timeline-history file, too */
4855         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
4856         unlink(recoveryPath);           /* ignore any error */
4857
4858         /*
4859          * Rename the config file out of the way, so that we don't accidentally
4860          * re-enter archive recovery mode in a subsequent crash.
4861          */
4862         unlink(RECOVERY_COMMAND_DONE);
4863         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
4864                 ereport(FATAL,
4865                                 (errcode_for_file_access(),
4866                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4867                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
4868
4869         ereport(LOG,
4870                         (errmsg("archive recovery complete")));
4871 }
4872
4873 /*
4874  * For point-in-time recovery, this function decides whether we want to
4875  * stop applying the XLOG at or after the current record.
4876  *
4877  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
4878  * *includeThis is set TRUE if we should apply this record before stopping.
4879  *
4880  * We also track the timestamp of the latest applied COMMIT/ABORT
4881  * record in XLogCtl->recoveryLastXTime, for logging purposes.
4882  * Also, some information is saved in recoveryStopXid et al for use in
4883  * annotating the new timeline's history file.
4884  */
4885 static bool
4886 recoveryStopsHere(XLogRecord *record, bool *includeThis)
4887 {
4888         bool            stopsHere;
4889         uint8           record_info;
4890         TimestampTz recordXtime;
4891         char            recordRPName[MAXFNAMELEN];
4892
4893         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
4894         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
4895                 return false;
4896         record_info = record->xl_info & ~XLR_INFO_MASK;
4897         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
4898         {
4899                 xl_xact_commit_compact *recordXactCommitData;
4900
4901                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
4902                 recordXtime = recordXactCommitData->xact_time;
4903         }
4904         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
4905         {
4906                 xl_xact_commit *recordXactCommitData;
4907
4908                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
4909                 recordXtime = recordXactCommitData->xact_time;
4910         }
4911         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
4912         {
4913                 xl_xact_abort *recordXactAbortData;
4914
4915                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
4916                 recordXtime = recordXactAbortData->xact_time;
4917         }
4918         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
4919         {
4920                 xl_restore_point *recordRestorePointData;
4921
4922                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
4923                 recordXtime = recordRestorePointData->rp_time;
4924                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
4925         }
4926         else
4927                 return false;
4928
4929         /* Do we have a PITR target at all? */
4930         if (recoveryTarget == RECOVERY_TARGET_UNSET)
4931         {
4932                 /*
4933                  * Save timestamp of latest transaction commit/abort if this is a
4934                  * transaction record
4935                  */
4936                 if (record->xl_rmid == RM_XACT_ID)
4937                         SetLatestXTime(recordXtime);
4938                 return false;
4939         }
4940
4941         if (recoveryTarget == RECOVERY_TARGET_XID)
4942         {
4943                 /*
4944                  * There can be only one transaction end record with this exact
4945                  * transactionid
4946                  *
4947                  * when testing for an xid, we MUST test for equality only, since
4948                  * transactions are numbered in the order they start, not the order
4949                  * they complete. A higher numbered xid will complete before you about
4950                  * 50% of the time...
4951                  */
4952                 stopsHere = (record->xl_xid == recoveryTargetXid);
4953                 if (stopsHere)
4954                         *includeThis = recoveryTargetInclusive;
4955         }
4956         else if (recoveryTarget == RECOVERY_TARGET_NAME)
4957         {
4958                 /*
4959                  * There can be many restore points that share the same name, so we
4960                  * stop at the first one
4961                  */
4962                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
4963
4964                 /*
4965                  * Ignore recoveryTargetInclusive because this is not a transaction
4966                  * record
4967                  */
4968                 *includeThis = false;
4969         }
4970         else
4971         {
4972                 /*
4973                  * There can be many transactions that share the same commit time, so
4974                  * we stop after the last one, if we are inclusive, or stop at the
4975                  * first one if we are exclusive
4976                  */
4977                 if (recoveryTargetInclusive)
4978                         stopsHere = (recordXtime > recoveryTargetTime);
4979                 else
4980                         stopsHere = (recordXtime >= recoveryTargetTime);
4981                 if (stopsHere)
4982                         *includeThis = false;
4983         }
4984
4985         if (stopsHere)
4986         {
4987                 recoveryStopXid = record->xl_xid;
4988                 recoveryStopTime = recordXtime;
4989                 recoveryStopAfter = *includeThis;
4990
4991                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
4992                 {
4993                         if (recoveryStopAfter)
4994                                 ereport(LOG,
4995                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
4996                                                                 recoveryStopXid,
4997                                                                 timestamptz_to_str(recoveryStopTime))));
4998                         else
4999                                 ereport(LOG,
5000                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5001                                                                 recoveryStopXid,
5002                                                                 timestamptz_to_str(recoveryStopTime))));
5003                 }
5004                 else if (record_info == XLOG_XACT_ABORT)
5005                 {
5006                         if (recoveryStopAfter)
5007                                 ereport(LOG,
5008                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5009                                                                 recoveryStopXid,
5010                                                                 timestamptz_to_str(recoveryStopTime))));
5011                         else
5012                                 ereport(LOG,
5013                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5014                                                                 recoveryStopXid,
5015                                                                 timestamptz_to_str(recoveryStopTime))));
5016                 }
5017                 else
5018                 {
5019                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
5020
5021                         ereport(LOG,
5022                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5023                                                 recoveryStopName,
5024                                                 timestamptz_to_str(recoveryStopTime))));
5025                 }
5026
5027                 /*
5028                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
5029                  * restore point since they are timestamped, though the latest
5030                  * transaction time is not updated.
5031                  */
5032                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
5033                         SetLatestXTime(recordXtime);
5034         }
5035         else if (record->xl_rmid == RM_XACT_ID)
5036                 SetLatestXTime(recordXtime);
5037
5038         return stopsHere;
5039 }
5040
5041 /*
5042  * Recheck shared recoveryPause by polling.
5043  *
5044  * XXX Can also be done with shared latch.
5045  */
5046 static void
5047 recoveryPausesHere(void)
5048 {
5049         ereport(LOG,
5050                         (errmsg("recovery has paused"),
5051                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5052
5053         while (RecoveryIsPaused())
5054         {
5055                 pg_usleep(1000000L);    /* 1000 ms */
5056                 HandleStartupProcInterrupts();
5057         }
5058 }
5059
5060 bool
5061 RecoveryIsPaused(void)
5062 {
5063         /* use volatile pointer to prevent code rearrangement */
5064         volatile XLogCtlData *xlogctl = XLogCtl;
5065         bool            recoveryPause;
5066
5067         SpinLockAcquire(&xlogctl->info_lck);
5068         recoveryPause = xlogctl->recoveryPause;
5069         SpinLockRelease(&xlogctl->info_lck);
5070
5071         return recoveryPause;
5072 }
5073
5074 void
5075 SetRecoveryPause(bool recoveryPause)
5076 {
5077         /* use volatile pointer to prevent code rearrangement */
5078         volatile XLogCtlData *xlogctl = XLogCtl;
5079
5080         SpinLockAcquire(&xlogctl->info_lck);
5081         xlogctl->recoveryPause = recoveryPause;
5082         SpinLockRelease(&xlogctl->info_lck);
5083 }
5084
5085 /*
5086  * Save timestamp of latest processed commit/abort record.
5087  *
5088  * We keep this in XLogCtl, not a simple static variable, so that it can be
5089  * seen by processes other than the startup process.  Note in particular
5090  * that CreateRestartPoint is executed in the checkpointer.
5091  */
5092 static void
5093 SetLatestXTime(TimestampTz xtime)
5094 {
5095         /* use volatile pointer to prevent code rearrangement */
5096         volatile XLogCtlData *xlogctl = XLogCtl;
5097
5098         SpinLockAcquire(&xlogctl->info_lck);
5099         xlogctl->recoveryLastXTime = xtime;
5100         SpinLockRelease(&xlogctl->info_lck);
5101 }
5102
5103 /*
5104  * Fetch timestamp of latest processed commit/abort record.
5105  */
5106 TimestampTz
5107 GetLatestXTime(void)
5108 {
5109         /* use volatile pointer to prevent code rearrangement */
5110         volatile XLogCtlData *xlogctl = XLogCtl;
5111         TimestampTz xtime;
5112
5113         SpinLockAcquire(&xlogctl->info_lck);
5114         xtime = xlogctl->recoveryLastXTime;
5115         SpinLockRelease(&xlogctl->info_lck);
5116
5117         return xtime;
5118 }
5119
5120 /*
5121  * Save timestamp of the next chunk of WAL records to apply.
5122  *
5123  * We keep this in XLogCtl, not a simple static variable, so that it can be
5124  * seen by all backends.
5125  */
5126 static void
5127 SetCurrentChunkStartTime(TimestampTz xtime)
5128 {
5129         /* use volatile pointer to prevent code rearrangement */
5130         volatile XLogCtlData *xlogctl = XLogCtl;
5131
5132         SpinLockAcquire(&xlogctl->info_lck);
5133         xlogctl->currentChunkStartTime = xtime;
5134         SpinLockRelease(&xlogctl->info_lck);
5135 }
5136
5137 /*
5138  * Fetch timestamp of latest processed commit/abort record.
5139  * Startup process maintains an accurate local copy in XLogReceiptTime
5140  */
5141 TimestampTz
5142 GetCurrentChunkReplayStartTime(void)
5143 {
5144         /* use volatile pointer to prevent code rearrangement */
5145         volatile XLogCtlData *xlogctl = XLogCtl;
5146         TimestampTz xtime;
5147
5148         SpinLockAcquire(&xlogctl->info_lck);
5149         xtime = xlogctl->currentChunkStartTime;
5150         SpinLockRelease(&xlogctl->info_lck);
5151
5152         return xtime;
5153 }
5154
5155 /*
5156  * Returns time of receipt of current chunk of XLOG data, as well as
5157  * whether it was received from streaming replication or from archives.
5158  */
5159 void
5160 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5161 {
5162         /*
5163          * This must be executed in the startup process, since we don't export the
5164          * relevant state to shared memory.
5165          */
5166         Assert(InRecovery);
5167
5168         *rtime = XLogReceiptTime;
5169         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5170 }
5171
/*
 * Raise an ERROR when currValue is smaller than minValue.
 *
 * param_name is substituted into the message as-is; it is a parameter name
 * and therefore needs no translation.  currValue and minValue are each
 * evaluated more than once, so pass expressions without side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while (0)
5188
5189 /*
5190  * Check to see if required parameters are set high enough on this server
5191  * for various aspects of recovery operation.
5192  */
5193 static void
5194 CheckRequiredParameterValues(void)
5195 {
5196         /*
5197          * For archive recovery, the WAL must be generated with at least 'archive'
5198          * wal_level.
5199          */
5200         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5201         {
5202                 ereport(WARNING,
5203                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5204                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5205         }
5206
5207         /*
5208          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5209          * we must have at least as many backend slots as the primary.
5210          */
5211         if (InArchiveRecovery && EnableHotStandby)
5212         {
5213                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5214                         ereport(ERROR,
5215                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
5216                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5217
5218                 /* We ignore autovacuum_max_workers when we make this test. */
5219                 RecoveryRequiresIntParameter("max_connections",
5220                                                                          MaxConnections,
5221                                                                          ControlFile->MaxConnections);
5222                 RecoveryRequiresIntParameter("max_prepared_transactions",
5223                                                                          max_prepared_xacts,
5224                                                                          ControlFile->max_prepared_xacts);
5225                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5226                                                                          max_locks_per_xact,
5227                                                                          ControlFile->max_locks_per_xact);
5228         }
5229 }
5230
5231 /*
5232  * This must be called ONCE during postmaster or standalone-backend startup
5233  */
5234 void
5235 StartupXLOG(void)
5236 {
5237         XLogCtlInsert *Insert;
5238         CheckPoint      checkPoint;
5239         bool            wasShutdown;
5240         bool            reachedStopPoint = false;
5241         bool            haveBackupLabel = false;
5242         XLogRecPtr      RecPtr,
5243                                 checkPointLoc,
5244                                 EndOfLog;
5245         XLogSegNo       endLogSegNo;
5246         XLogRecord *record;
5247         uint32          freespace;
5248         TransactionId oldestActiveXID;
5249         bool            backupEndRequired = false;
5250         bool            backupFromStandby = false;
5251         DBState         dbstate_at_startup;
5252
5253         /*
5254          * Read control file and check XLOG status looks valid.
5255          *
5256          * Note: in most control paths, *ControlFile is already valid and we need
5257          * not do ReadControlFile() here, but might as well do it to be sure.
5258          */
5259         ReadControlFile();
5260
5261         if (ControlFile->state < DB_SHUTDOWNED ||
5262                 ControlFile->state > DB_IN_PRODUCTION ||
5263                 !XRecOffIsValid(ControlFile->checkPoint))
5264                 ereport(FATAL,
5265                                 (errmsg("control file contains invalid data")));
5266
5267         if (ControlFile->state == DB_SHUTDOWNED)
5268                 ereport(LOG,
5269                                 (errmsg("database system was shut down at %s",
5270                                                 str_time(ControlFile->time))));
5271         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
5272                 ereport(LOG,
5273                                 (errmsg("database system was shut down in recovery at %s",
5274                                                 str_time(ControlFile->time))));
5275         else if (ControlFile->state == DB_SHUTDOWNING)
5276                 ereport(LOG,
5277                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5278                                                 str_time(ControlFile->time))));
5279         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5280                 ereport(LOG,
5281                    (errmsg("database system was interrupted while in recovery at %s",
5282                                    str_time(ControlFile->time)),
5283                         errhint("This probably means that some data is corrupted and"
5284                                         " you will have to use the last backup for recovery.")));
5285         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5286                 ereport(LOG,
5287                                 (errmsg("database system was interrupted while in recovery at log time %s",
5288                                                 str_time(ControlFile->checkPointCopy.time)),
5289                                  errhint("If this has occurred more than once some data might be corrupted"
5290                           " and you might need to choose an earlier recovery target.")));
5291         else if (ControlFile->state == DB_IN_PRODUCTION)
5292                 ereport(LOG,
5293                           (errmsg("database system was interrupted; last known up at %s",
5294                                           str_time(ControlFile->time))));
5295
5296         /* This is just to allow attaching to startup process with a debugger */
5297 #ifdef XLOG_REPLAY_DELAY
5298         if (ControlFile->state != DB_SHUTDOWNED)
5299                 pg_usleep(60000000L);
5300 #endif
5301
5302         /*
5303          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5304          * someone has performed a copy for PITR, these directories may have been
5305          * excluded and need to be re-created.
5306          */
5307         ValidateXLOGDirectoryStructure();
5308
5309         /*
5310          * Clear out any old relcache cache files.      This is *necessary* if we do
5311          * any WAL replay, since that would probably result in the cache files
5312          * being out of sync with database reality.  In theory we could leave them
5313          * in place if the database had been cleanly shut down, but it seems
5314          * safest to just remove them always and let them be rebuilt during the
5315          * first backend startup.
5316          */
5317         RelationCacheInitFileRemove();
5318
5319         /*
5320          * Initialize on the assumption we want to recover to the same timeline
5321          * that's active according to pg_control.
5322          */
5323         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
5324
5325         /*
5326          * Check for recovery control file, and if so set up state for offline
5327          * recovery
5328          */
5329         readRecoveryCommandFile();
5330
5331         /* Now we can determine the list of expected TLIs */
5332         expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
5333
5334         /*
5335          * If the location of the checkpoint record is not on the expected
5336          * timeline in the history of the requested timeline, we cannot proceed:
5337          * the backup is not part of the history of the requested timeline.
5338          */
5339         if (tliOfPointInHistory(ControlFile->checkPoint, expectedTLEs) !=
5340                         ControlFile->checkPointCopy.ThisTimeLineID)
5341         {
5342                 XLogRecPtr switchpoint;
5343
5344                 /*
5345                  * tliSwitchPoint will throw an error if the checkpoint's timeline
5346                  * is not in expectedTLEs at all.
5347                  */
5348                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs);
5349                 ereport(FATAL,
5350                                 (errmsg("requested timeline %u is not a child of this server's history",
5351                                                 recoveryTargetTLI),
5352                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
5353                                                    (uint32) (ControlFile->checkPoint >> 32),
5354                                                    (uint32) ControlFile->checkPoint,
5355                                                    ControlFile->checkPointCopy.ThisTimeLineID,
5356                                                    (uint32) (switchpoint >> 32),
5357                                                    (uint32) switchpoint)));
5358         }
5359
5360         /*
5361          * The min recovery point should be part of the requested timeline's
5362          * history, too.
5363          */
5364         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
5365                 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
5366                         ControlFile->minRecoveryPointTLI)
5367                 ereport(FATAL,
5368                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
5369                                                 recoveryTargetTLI,
5370                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
5371                                                 (uint32) ControlFile->minRecoveryPoint,
5372                                                 ControlFile->minRecoveryPointTLI)));
5373
5374         /*
5375          * Save the selected recovery target timeline ID and
5376          * archive_cleanup_command in shared memory so that other processes can
5377          * see them
5378          */
5379         XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
5380         strncpy(XLogCtl->archiveCleanupCommand,
5381                         archiveCleanupCommand ? archiveCleanupCommand : "",
5382                         sizeof(XLogCtl->archiveCleanupCommand));
5383
5384         if (InArchiveRecovery)
5385         {
5386                 if (StandbyMode)
5387                         ereport(LOG,
5388                                         (errmsg("entering standby mode")));
5389                 else if (recoveryTarget == RECOVERY_TARGET_XID)
5390                         ereport(LOG,
5391                                         (errmsg("starting point-in-time recovery to XID %u",
5392                                                         recoveryTargetXid)));
5393                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
5394                         ereport(LOG,
5395                                         (errmsg("starting point-in-time recovery to %s",
5396                                                         timestamptz_to_str(recoveryTargetTime))));
5397                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
5398                         ereport(LOG,
5399                                         (errmsg("starting point-in-time recovery to \"%s\"",
5400                                                         recoveryTargetName)));
5401                 else
5402                         ereport(LOG,
5403                                         (errmsg("starting archive recovery")));
5404         }
5405
5406         /*
5407          * Take ownership of the wakeup latch if we're going to sleep during
5408          * recovery.
5409          */
5410         if (StandbyMode)
5411                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
5412
5413         if (read_backup_label(&checkPointLoc, &backupEndRequired,
5414                                                   &backupFromStandby))
5415         {
5416                 /*
5417                  * When a backup_label file is present, we want to roll forward from
5418                  * the checkpoint it identifies, rather than using pg_control.
5419                  */
5420                 record = ReadCheckpointRecord(checkPointLoc, 0);
5421                 if (record != NULL)
5422                 {
5423                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5424                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5425                         ereport(DEBUG1,
5426                                         (errmsg("checkpoint record is at %X/%X",
5427                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5428                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5429
5430                         /*
5431                          * Make sure that REDO location exists. This may not be the case
5432                          * if there was a crash during an online backup, which left a
5433                          * backup_label around that references a WAL segment that's
5434                          * already been archived.
5435                          */
5436                         if (XLByteLT(checkPoint.redo, checkPointLoc))
5437                         {
5438                                 if (!ReadRecord(&(checkPoint.redo), LOG, false))
5439                                         ereport(FATAL,
5440                                                         (errmsg("could not find redo location referenced by checkpoint record"),
5441                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5442                         }
5443                 }
5444                 else
5445                 {
5446                         ereport(FATAL,
5447                                         (errmsg("could not locate required checkpoint record"),
5448                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5449                         wasShutdown = false;    /* keep compiler quiet */
5450                 }
5451                 /* set flag to delete it later */
5452                 haveBackupLabel = true;
5453         }
5454         else
5455         {
5456                 /*
5457                  * Get the last valid checkpoint record.  If the latest one according
5458                  * to pg_control is broken, try the next-to-last one.
5459                  */
5460                 checkPointLoc = ControlFile->checkPoint;
5461                 RedoStartLSN = ControlFile->checkPointCopy.redo;
5462                 record = ReadCheckpointRecord(checkPointLoc, 1);
5463                 if (record != NULL)
5464                 {
5465                         ereport(DEBUG1,
5466                                         (errmsg("checkpoint record is at %X/%X",
5467                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5468                 }
5469                 else if (StandbyMode)
5470                 {
5471                         /*
5472                          * The last valid checkpoint record required for a streaming
5473                          * recovery exists in neither standby nor the primary.
5474                          */
5475                         ereport(PANIC,
5476                                         (errmsg("could not locate a valid checkpoint record")));
5477                 }
5478                 else
5479                 {
5480                         checkPointLoc = ControlFile->prevCheckPoint;
5481                         record = ReadCheckpointRecord(checkPointLoc, 2);
5482                         if (record != NULL)
5483                         {
5484                                 ereport(LOG,
5485                                                 (errmsg("using previous checkpoint record at %X/%X",
5486                                                                 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5487                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5488                         }
5489                         else
5490                                 ereport(PANIC,
5491                                          (errmsg("could not locate a valid checkpoint record")));
5492                 }
5493                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5494                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5495         }
5496
5497         LastRec = RecPtr = checkPointLoc;
5498
5499         ereport(DEBUG1,
5500                         (errmsg("redo record is at %X/%X; shutdown %s",
5501                                         (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
5502                                         wasShutdown ? "TRUE" : "FALSE")));
5503         ereport(DEBUG1,
5504                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5505                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5506                                         checkPoint.nextOid)));
5507         ereport(DEBUG1,
5508                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5509                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5510         ereport(DEBUG1,
5511                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
5512                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
5513         if (!TransactionIdIsNormal(checkPoint.nextXid))
5514                 ereport(PANIC,
5515                                 (errmsg("invalid next transaction ID")));
5516
5517         /* initialize shared memory variables from the checkpoint record */
5518         ShmemVariableCache->nextXid = checkPoint.nextXid;
5519         ShmemVariableCache->nextOid = checkPoint.nextOid;
5520         ShmemVariableCache->oidCount = 0;
5521         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5522         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5523         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
5524         XLogCtl->ckptXid = checkPoint.nextXid;
5525
5526         /*
5527          * We must replay WAL entries using the same TimeLineID they were created
5528          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5529          * also xlog_redo()).
5530          */
5531         ThisTimeLineID = checkPoint.ThisTimeLineID;
5532
5533         lastFullPageWrites = checkPoint.fullPageWrites;
5534
5535         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5536
5537         if (XLByteLT(RecPtr, checkPoint.redo))
5538                 ereport(PANIC,
5539                                 (errmsg("invalid redo in checkpoint record")));
5540
5541         /*
5542          * Check whether we need to force recovery from WAL.  If it appears to
5543          * have been a clean shutdown and we did not have a recovery.conf file,
5544          * then assume no recovery needed.
5545          */
5546         if (XLByteLT(checkPoint.redo, RecPtr))
5547         {
5548                 if (wasShutdown)
5549                         ereport(PANIC,
5550                                         (errmsg("invalid redo record in shutdown checkpoint")));
5551                 InRecovery = true;
5552         }
5553         else if (ControlFile->state != DB_SHUTDOWNED)
5554                 InRecovery = true;
5555         else if (InArchiveRecovery)
5556         {
5557                 /* force recovery due to presence of recovery.conf */
5558                 InRecovery = true;
5559         }
5560
5561         /* REDO */
5562         if (InRecovery)
5563         {
5564                 int                     rmid;
5565
5566                 /* use volatile pointer to prevent code rearrangement */
5567                 volatile XLogCtlData *xlogctl = XLogCtl;
5568
5569                 /*
5570                  * Update pg_control to show that we are recovering and to show the
5571                  * selected checkpoint as the place we are starting from. We also mark
5572                  * pg_control with any minimum recovery stop point obtained from a
5573                  * backup history file.
5574                  */
5575                 dbstate_at_startup = ControlFile->state;
5576                 if (InArchiveRecovery)
5577                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5578                 else
5579                 {
5580                         ereport(LOG,
5581                                         (errmsg("database system was not properly shut down; "
5582                                                         "automatic recovery in progress")));
5583                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5584                 }
5585                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5586                 ControlFile->checkPoint = checkPointLoc;
5587                 ControlFile->checkPointCopy = checkPoint;
5588                 if (InArchiveRecovery)
5589                 {
5590                         /* initialize minRecoveryPoint if not set yet */
5591                         if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
5592                         {
5593                                 ControlFile->minRecoveryPoint = checkPoint.redo;
5594                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
5595                         }
5596                 }
5597
5598                 /*
5599                  * Set backupStartPoint if we're starting recovery from a base backup.
5600                  *
5601                  * Set backupEndPoint and use minRecoveryPoint as the backup end
5602                  * location if we're starting recovery from a base backup which was
5603                  * taken from the standby. In this case, the database system status in
5604                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
5605                  * the backup is corrupted, so we cancel recovery.
5606                  */
5607                 if (haveBackupLabel)
5608                 {
5609                         ControlFile->backupStartPoint = checkPoint.redo;
5610                         ControlFile->backupEndRequired = backupEndRequired;
5611
5612                         if (backupFromStandby)
5613                         {
5614                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
5615                                         ereport(FATAL,
5616                                                         (errmsg("backup_label contains data inconsistent with control file"),
5617                                                          errhint("This means that the backup is corrupted and you will "
5618                                                            "have to use another backup for recovery.")));
5619                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
5620                         }
5621                 }
5622                 ControlFile->time = (pg_time_t) time(NULL);
5623                 /* No need to hold ControlFileLock yet, we aren't up far enough */
5624                 UpdateControlFile();
5625
5626                 /* initialize our local copy of minRecoveryPoint */
5627                 minRecoveryPoint = ControlFile->minRecoveryPoint;
5628                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5629
5630                 /*
5631                  * Reset pgstat data, because it may be invalid after recovery.
5632                  */
5633                 pgstat_reset_all();
5634
5635                 /*
5636                  * If there was a backup label file, it's done its job and the info
5637                  * has now been propagated into pg_control.  We must get rid of the
5638                  * label file so that if we crash during recovery, we'll pick up at
5639                  * the latest recovery restartpoint instead of going all the way back
5640                  * to the backup start point.  It seems prudent though to just rename
5641                  * the file out of the way rather than delete it completely.
5642                  */
5643                 if (haveBackupLabel)
5644                 {
5645                         unlink(BACKUP_LABEL_OLD);
5646                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5647                                 ereport(FATAL,
5648                                                 (errcode_for_file_access(),
5649                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5650                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5651                 }
5652
5653                 /* Check that the GUCs used to generate the WAL allow recovery */
5654                 CheckRequiredParameterValues();
5655
5656                 /*
5657                  * We're in recovery, so unlogged relations may be trashed and must be
5658                  * reset.  This should be done BEFORE allowing Hot Standby
5659                  * connections, so that read-only backends don't try to read whatever
5660                  * garbage is left over from before.
5661                  */
5662                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5663
5664                 /*
5665                  * Likewise, delete any saved transaction snapshot files that got left
5666                  * behind by crashed backends.
5667                  */
5668                 DeleteAllExportedSnapshotFiles();
5669
5670                 /*
5671                  * Initialize for Hot Standby, if enabled. We won't let backends in
5672                  * yet, not until we've reached the min recovery point specified in
5673                  * control file and we've established a recovery snapshot from a
5674                  * running-xacts WAL record.
5675                  */
5676                 if (InArchiveRecovery && EnableHotStandby)
5677                 {
5678                         TransactionId *xids;
5679                         int                     nxids;
5680
5681                         ereport(DEBUG1,
5682                                         (errmsg("initializing for hot standby")));
5683
5684                         InitRecoveryTransactionEnvironment();
5685
5686                         if (wasShutdown)
5687                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5688                         else
5689                                 oldestActiveXID = checkPoint.oldestActiveXid;
5690                         Assert(TransactionIdIsValid(oldestActiveXID));
5691
5692                         /*
5693                          * Startup commit log and subtrans only. Other SLRUs are not
5694                          * maintained during recovery and need not be started yet.
5695                          */
5696                         StartupCLOG();
5697                         StartupSUBTRANS(oldestActiveXID);
5698
5699                         /*
5700                          * If we're beginning at a shutdown checkpoint, we know that
5701                          * nothing was running on the master at this point. So fake-up an
5702                          * empty running-xacts record and use that here and now. Recover
5703                          * additional standby state for prepared transactions.
5704                          */
5705                         if (wasShutdown)
5706                         {
5707                                 RunningTransactionsData running;
5708                                 TransactionId latestCompletedXid;
5709
5710                                 /*
5711                                  * Construct a RunningTransactions snapshot representing a
5712                                  * shut down server, with only prepared transactions still
5713                                  * alive. We're never overflowed at this point because all
5714                                  * subxids are listed with their parent prepared transactions.
5715                                  */
5716                                 running.xcnt = nxids;
5717                                 running.subxcnt = 0;
5718                                 running.subxid_overflow = false;
5719                                 running.nextXid = checkPoint.nextXid;
5720                                 running.oldestRunningXid = oldestActiveXID;
5721                                 latestCompletedXid = checkPoint.nextXid;
5722                                 TransactionIdRetreat(latestCompletedXid);
5723                                 Assert(TransactionIdIsNormal(latestCompletedXid));
5724                                 running.latestCompletedXid = latestCompletedXid;
5725                                 running.xids = xids;
5726
5727                                 ProcArrayApplyRecoveryInfo(&running);
5728
5729                                 StandbyRecoverPreparedTransactions(false);
5730                         }
5731                 }
5732
5733                 /* Initialize resource managers */
5734                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5735                 {
5736                         if (RmgrTable[rmid].rm_startup != NULL)
5737                                 RmgrTable[rmid].rm_startup();
5738                 }
5739
5740                 /*
5741                  * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
5742                  * recoveryLastXTime.
5743                  *
5744                  * This is slightly confusing if we're starting from an online
5745                  * checkpoint; we've just read and replayed the checkpoint record, but
5746                  * we're going to start replay from its redo pointer, which precedes
5747                  * the location of the checkpoint record itself. So even though the
5748                  * last record we've replayed is indeed ReadRecPtr, we haven't
5749                  * replayed all the preceding records yet. That's OK for the current
5750                  * use of these variables.
5751                  */
5752                 SpinLockAcquire(&xlogctl->info_lck);
5753                 xlogctl->replayEndRecPtr = ReadRecPtr;
5754                 xlogctl->replayEndTLI = ThisTimeLineID;
5755                 xlogctl->recoveryLastRecPtr = EndRecPtr;
5756                 xlogctl->recoveryLastXTime = 0;
5757                 xlogctl->currentChunkStartTime = 0;
5758                 xlogctl->recoveryPause = false;
5759                 SpinLockRelease(&xlogctl->info_lck);
5760
5761                 /* Also ensure XLogReceiptTime has a sane value */
5762                 XLogReceiptTime = GetCurrentTimestamp();
5763
5764                 /*
5765                  * Let postmaster know we've started redo now, so that it can launch
5766                  * checkpointer to perform restartpoints.  We don't bother during
5767                  * crash recovery as restartpoints can only be performed during
5768                  * archive recovery.  And we'd like to keep crash recovery simple, to
5769                  * avoid introducing bugs that could affect you when recovering after
5770                  * crash.
5771                  *
5772                  * After this point, we can no longer assume that we're the only
5773                  * process in addition to postmaster!  Also, fsync requests are
5774                  * subsequently to be handled by the checkpointer, not locally.
5775                  */
5776                 if (InArchiveRecovery && IsUnderPostmaster)
5777                 {
5778                         PublishStartupProcessInformation();
5779                         SetForwardFsyncRequests();
5780                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
5781                         bgwriterLaunched = true;
5782                 }
5783
5784                 /*
5785                  * Allow read-only connections immediately if we're consistent
5786                  * already.
5787                  */
5788                 CheckRecoveryConsistency();
5789
5790                 /*
5791                  * Find the first record that logically follows the checkpoint --- it
5792                  * might physically precede it, though.
5793                  */
5794                 if (XLByteLT(checkPoint.redo, RecPtr))
5795                 {
5796                         /* back up to find the record */
5797                         record = ReadRecord(&(checkPoint.redo), PANIC, false);
5798                 }
5799                 else
5800                 {
5801                         /* just have to read next record after CheckPoint */
5802                         record = ReadRecord(NULL, LOG, false);
5803                 }
5804
5805                 if (record != NULL)
5806                 {
5807                         bool            recoveryContinue = true;
5808                         bool            recoveryApply = true;
5809                         bool            recoveryPause = false;
5810                         ErrorContextCallback errcallback;
5811                         TimestampTz xtime;
5812
5813                         InRedo = true;
5814
5815                         ereport(LOG,
5816                                         (errmsg("redo starts at %X/%X",
5817                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
5818
5819                         /*
5820                          * main redo apply loop
5821                          */
5822                         do
5823                         {
5824 #ifdef WAL_DEBUG
5825                                 if (XLOG_DEBUG ||
5826                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
5827                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
5828                                 {
5829                                         StringInfoData buf;
5830
5831                                         initStringInfo(&buf);
5832                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5833                                                                          (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
5834                                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
5835                                         xlog_outrec(&buf, record);
5836                                         appendStringInfo(&buf, " - ");
5837                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5838                                                                                                            record->xl_info,
5839                                                                                                          XLogRecGetData(record));
5840                                         elog(LOG, "%s", buf.data);
5841                                         pfree(buf.data);
5842                                 }
5843 #endif
5844
5845                                 /* Handle interrupt signals of startup process */
5846                                 HandleStartupProcInterrupts();
5847
5848                                 /* Allow read-only connections if we're consistent now */
5849                                 CheckRecoveryConsistency();
5850
5851                                 /*
5852                                  * Have we reached our recovery target?
5853                                  */
5854                                 if (recoveryStopsHere(record, &recoveryApply))
5855                                 {
5856                                         /*
5857                                          * Pause only if users can connect to send a resume
5858                                          * message
5859                                          */
5860                                         if (recoveryPauseAtTarget && standbyState == STANDBY_SNAPSHOT_READY)
5861                                         {
5862                                                 SetRecoveryPause(true);
5863                                                 recoveryPausesHere();
5864                                         }
5865                                         reachedStopPoint = true;        /* see below */
5866                                         recoveryContinue = false;
5867                                         if (!recoveryApply)
5868                                                 break;
5869                                 }
5870
5871                                 /* Setup error traceback support for ereport() */
5872                                 errcallback.callback = rm_redo_error_callback;
5873                                 errcallback.arg = (void *) record;
5874                                 errcallback.previous = error_context_stack;
5875                                 error_context_stack = &errcallback;
5876
5877                                 /*
5878                                  * ShmemVariableCache->nextXid must be beyond record's xid.
5879                                  *
5880                                  * We don't expect anyone else to modify nextXid, hence we
5881                                  * don't need to hold a lock while examining it.  We still
5882                                  * acquire the lock to modify it, though.
5883                                  */
5884                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5885                                                                                                  ShmemVariableCache->nextXid))
5886                                 {
5887                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
5888                                         ShmemVariableCache->nextXid = record->xl_xid;
5889                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5890                                         LWLockRelease(XidGenLock);
5891                                 }
5892
5893                                 /*
5894                                  * Update shared replayEndRecPtr before replaying this record,
5895                                  * so that XLogFlush will update minRecoveryPoint correctly.
5896                                  */
5897                                 SpinLockAcquire(&xlogctl->info_lck);
5898                                 xlogctl->replayEndRecPtr = EndRecPtr;
5899                                 recoveryPause = xlogctl->recoveryPause;
5900                                 SpinLockRelease(&xlogctl->info_lck);
5901
5902                                 /*
5903                                  * Pause only if users can connect to send a resume message
5904                                  */
5905                                 if (recoveryPause && standbyState == STANDBY_SNAPSHOT_READY)
5906                                         recoveryPausesHere();
5907
5908                                 /*
5909                                  * If we are attempting to enter Hot Standby mode, process
5910                                  * XIDs we see
5911                                  */
5912                                 if (standbyState >= STANDBY_INITIALIZED &&
5913                                         TransactionIdIsValid(record->xl_xid))
5914                                         RecordKnownAssignedTransactionIds(record->xl_xid);
5915
5916                                 /* Now apply the WAL record itself */
5917                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5918
5919                                 /* Pop the error context stack */
5920                                 error_context_stack = errcallback.previous;
5921
5922                                 if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
5923                                         XLByteLE(ControlFile->backupEndPoint, EndRecPtr))
5924                                 {
5925                                         /*
5926                                          * We have reached the end of base backup, the point where
5927                                          * the minimum recovery point in pg_control indicates. The
5928                                          * data on disk is now consistent. Reset backupStartPoint
5929                                          * and backupEndPoint.
5930                                          */
5931                                         elog(DEBUG1, "end of backup reached");
5932
5933                                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5934
5935                                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
5936                                         MemSet(&ControlFile->backupEndPoint, 0, sizeof(XLogRecPtr));
5937                                         ControlFile->backupEndRequired = false;
5938                                         UpdateControlFile();
5939
5940                                         LWLockRelease(ControlFileLock);
5941                                 }
5942
5943                                 /*
5944                                  * Update shared recoveryLastRecPtr after this record has been
5945                                  * replayed.
5946                                  */
5947                                 SpinLockAcquire(&xlogctl->info_lck);
5948                                 xlogctl->recoveryLastRecPtr = EndRecPtr;
5949                                 SpinLockRelease(&xlogctl->info_lck);
5950
5951                                 LastRec = ReadRecPtr;
5952
5953                                 record = ReadRecord(NULL, LOG, false);
5954                         } while (record != NULL && recoveryContinue);
5955
5956                         /*
5957                          * end of main redo apply loop
5958                          */
5959
5960                         ereport(LOG,
5961                                         (errmsg("redo done at %X/%X",
5962                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
5963                         xtime = GetLatestXTime();
5964                         if (xtime)
5965                                 ereport(LOG,
5966                                          (errmsg("last completed transaction was at log time %s",
5967                                                          timestamptz_to_str(xtime))));
5968                         InRedo = false;
5969                 }
5970                 else
5971                 {
5972                         /* there are no WAL records following the checkpoint */
5973                         ereport(LOG,
5974                                         (errmsg("redo is not required")));
5975                 }
5976         }
5977
5978         /*
5979          * Kill WAL receiver, if it's still running, before we continue to write
5980          * the startup checkpoint record. It will trump over the checkpoint and
5981          * subsequent records if it's still alive when we start writing WAL.
5982          */
5983         ShutdownWalRcv();
5984
5985         /*
5986          * We don't need the latch anymore. It's not strictly necessary to disown
5987          * it, but let's do it for the sake of tidiness.
5988          */
5989         if (StandbyMode)
5990                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
5991
5992         /*
5993          * We are now done reading the xlog from stream. Turn off streaming
5994          * recovery to force fetching the files (which would be required at end of
5995          * recovery, e.g., timeline history file) from archive or pg_xlog.
5996          */
5997         StandbyMode = false;
5998
5999         /*
6000          * Re-fetch the last valid or last applied record, so we can identify the
6001          * exact endpoint of what we consider the valid portion of WAL.
6002          */
6003         record = ReadRecord(&LastRec, PANIC, false);
6004         EndOfLog = EndRecPtr;
6005         XLByteToPrevSeg(EndOfLog, endLogSegNo);
6006
6007         /*
6008          * Complain if we did not roll forward far enough to render the backup
6009          * dump consistent.  Note: it is indeed okay to look at the local variable
6010          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6011          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6012          * advanced beyond the WAL we processed.
6013          */
6014         if (InRecovery &&
6015                 (XLByteLT(EndOfLog, minRecoveryPoint) ||
6016                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6017         {
6018                 if (reachedStopPoint)
6019                 {
6020                         /* stopped because of stop request */
6021                         ereport(FATAL,
6022                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6023                 }
6024
6025                 /*
6026                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6027                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6028                  * tried to recover from an online backup but never called
6029                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6030                  * point. However, this also happens in crash recovery, if the system
6031                  * crashes while an online backup is in progress. We must not treat
6032                  * that as an error, or the database will refuse to start up.
6033                  */
6034                 if (InArchiveRecovery || ControlFile->backupEndRequired)
6035                 {
6036                         if (ControlFile->backupEndRequired)
6037                                 ereport(FATAL,
6038                                                 (errmsg("WAL ends before end of online backup"),
6039                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6040                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6041                                 ereport(FATAL,
6042                                                 (errmsg("WAL ends before end of online backup"),
6043                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6044                         else
6045                                 ereport(FATAL,
6046                                           (errmsg("WAL ends before consistent recovery point")));
6047                 }
6048         }
6049
6050         /*
6051          * Consider whether we need to assign a new timeline ID.
6052          *
6053          * If we are doing an archive recovery, we always assign a new ID.      This
6054          * handles a couple of issues.  If we stopped short of the end of WAL
6055          * during recovery, then we are clearly generating a new timeline and must
6056          * assign it a unique new ID.  Even if we ran to the end, modifying the
6057          * current last segment is problematic because it may result in trying to
6058          * overwrite an already-archived copy of that segment, and we encourage
6059          * DBAs to make their archive_commands reject that.  We can dodge the
6060          * problem by making the new active segment have a new timeline ID.
6061          *
6062          * In a normal crash recovery, we can just extend the timeline we were in.
6063          */
6064         if (InArchiveRecovery)
6065         {
6066                 char    reason[200];
6067
6068                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6069                 ereport(LOG,
6070                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6071
6072                 /*
6073                  * Create a comment for the history file to explain why and where
6074                  * timeline changed.
6075                  */
6076                 if (recoveryTarget == RECOVERY_TARGET_XID)
6077                         snprintf(reason, sizeof(reason),
6078                                          "%s transaction %u",
6079                                          recoveryStopAfter ? "after" : "before",
6080                                          recoveryStopXid);
6081                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6082                         snprintf(reason, sizeof(reason),
6083                                          "%s %s\n",
6084                                          recoveryStopAfter ? "after" : "before",
6085                                          timestamptz_to_str(recoveryStopTime));
6086                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6087                         snprintf(reason, sizeof(reason),
6088                                          "at restore point \"%s\"",
6089                                          recoveryStopName);
6090                 else
6091                         snprintf(reason, sizeof(reason), "no recovery target specified");
6092
6093                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6094                                                          EndRecPtr, reason);
6095         }
6096
6097         /* Save the selected TimeLineID in shared memory, too */
6098         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6099
6100         /*
6101          * We are now done reading the old WAL.  Turn off archive fetching if it
6102          * was active, and make a writable copy of the last WAL segment. (Note
6103          * that we also have a copy of the last block of the old WAL in readBuf;
6104          * we will use that below.)
6105          */
6106         if (InArchiveRecovery)
6107                 exitArchiveRecovery(curFileTLI, endLogSegNo);
6108
6109         /*
6110          * Prepare to write WAL starting at EndOfLog position, and init xlog
6111          * buffer cache using the block containing the last record from the
6112          * previous incarnation.
6113          */
6114         openLogSegNo = endLogSegNo;
6115         openLogFile = XLogFileOpen(openLogSegNo);
6116         openLogOff = 0;
6117         Insert = &XLogCtl->Insert;
6118         Insert->PrevRecord = LastRec;
6119         XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
6120
6121         /*
6122          * Tricky point here: readBuf contains the *last* block that the LastRec
6123          * record spans, not the one it starts in.      The last block is indeed the
6124          * one we want to use.
6125          */
6126         Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
6127         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6128         Insert->currpos = (char *) Insert->currpage +
6129                 (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);
6130
6131         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6132
6133         XLogCtl->LogwrtResult = LogwrtResult;
6134
6135         XLogCtl->LogwrtRqst.Write = EndOfLog;
6136         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6137
6138         freespace = INSERT_FREESPACE(Insert);
6139         if (freespace > 0)
6140         {
6141                 /* Make sure rest of page is zero */
6142                 MemSet(Insert->currpos, 0, freespace);
6143                 XLogCtl->Write.curridx = 0;
6144         }
6145         else
6146         {
6147                 /*
6148                  * Whenever LogwrtResult points to exactly the end of a page,
6149                  * Write.curridx must point to the *next* page (see XLogWrite()).
6150                  *
6151                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
6152                  * this is sufficient.  The first actual attempt to insert a log
6153                  * record will advance the insert state.
6154                  */
6155                 XLogCtl->Write.curridx = NextBufIdx(0);
6156         }
6157
6158         /* Pre-scan prepared transactions to find out the range of XIDs present */
6159         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6160
6161         /*
6162          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6163          * record before resource manager writes cleanup WAL records or checkpoint
6164          * record is written.
6165          */
6166         Insert->fullPageWrites = lastFullPageWrites;
6167         LocalSetXLogInsertAllowed();
6168         UpdateFullPageWrites();
6169         LocalXLogInsertAllowed = -1;
6170
6171         if (InRecovery)
6172         {
6173                 int                     rmid;
6174
6175                 /*
6176                  * Resource managers might need to write WAL records, eg, to record
6177                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
6178                  * this process only.
6179                  */
6180                 LocalSetXLogInsertAllowed();
6181
6182                 /*
6183                  * Allow resource managers to do any required cleanup.
6184                  */
6185                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6186                 {
6187                         if (RmgrTable[rmid].rm_cleanup != NULL)
6188                                 RmgrTable[rmid].rm_cleanup();
6189                 }
6190
6191                 /* Disallow XLogInsert again */
6192                 LocalXLogInsertAllowed = -1;
6193
6194                 /*
6195                  * Perform a checkpoint to update all our recovery activity to disk.
6196                  *
6197                  * Note that we write a shutdown checkpoint rather than an on-line
6198                  * one. This is not particularly critical, but since we may be
6199                  * assigning a new TLI, using a shutdown checkpoint allows us to have
6200                  * the rule that TLI only changes in shutdown checkpoints, which
6201                  * allows some extra error checking in xlog_redo.
6202                  */
6203                 if (bgwriterLaunched)
6204                         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6205                                                           CHECKPOINT_IMMEDIATE |
6206                                                           CHECKPOINT_WAIT);
6207                 else
6208                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6209
6210                 /*
6211                  * And finally, execute the recovery_end_command, if any.
6212                  */
6213                 if (recoveryEndCommand)
6214                         ExecuteRecoveryCommand(recoveryEndCommand,
6215                                                                    "recovery_end_command",
6216                                                                    true);
6217         }
6218
6219         /*
6220          * Preallocate additional log files, if wanted.
6221          */
6222         PreallocXlogFiles(EndOfLog);
6223
6224         /*
6225          * Reset initial contents of unlogged relations.  This has to be done
6226          * AFTER recovery is complete so that any unlogged relations created
6227          * during recovery also get picked up.
6228          */
6229         if (InRecovery)
6230                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6231
6232         /*
6233          * Okay, we're officially UP.
6234          */
6235         InRecovery = false;
6236
6237         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6238         ControlFile->state = DB_IN_PRODUCTION;
6239         ControlFile->time = (pg_time_t) time(NULL);
6240         UpdateControlFile();
6241         LWLockRelease(ControlFileLock);
6242
6243         /* start the archive_timeout timer running */
6244         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6245
6246         /* also initialize latestCompletedXid, to nextXid - 1 */
6247         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6248         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6249         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6250         LWLockRelease(ProcArrayLock);
6251
6252         /*
6253          * Start up the commit log and subtrans, if not already done for hot
6254          * standby.
6255          */
6256         if (standbyState == STANDBY_DISABLED)
6257         {
6258                 StartupCLOG();
6259                 StartupSUBTRANS(oldestActiveXID);
6260         }
6261
6262         /*
6263          * Perform end of recovery actions for any SLRUs that need it.
6264          */
6265         StartupMultiXact();
6266         TrimCLOG();
6267
6268         /* Reload shared-memory state for prepared transactions */
6269         RecoverPreparedTransactions();
6270
6271         /*
6272          * Shutdown the recovery environment. This must occur after
6273          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6274          */
6275         if (standbyState != STANDBY_DISABLED)
6276                 ShutdownRecoveryTransactionEnvironment();
6277
6278         /* Shut down readFile facility, free space */
6279         if (readFile >= 0)
6280         {
6281                 close(readFile);
6282                 readFile = -1;
6283         }
6284         if (readBuf)
6285         {
6286                 free(readBuf);
6287                 readBuf = NULL;
6288         }
6289         if (readRecordBuf)
6290         {
6291                 free(readRecordBuf);
6292                 readRecordBuf = NULL;
6293                 readRecordBufSize = 0;
6294         }
6295
6296         /*
6297          * If any of the critical GUCs have changed, log them before we allow
6298          * backends to write WAL.
6299          */
6300         LocalSetXLogInsertAllowed();
6301         XLogReportParameters();
6302
6303         /*
6304          * All done.  Allow backends to write WAL.      (Although the bool flag is
6305          * probably atomic in itself, we use the info_lck here to ensure that
6306          * there are no race conditions concerning visibility of other recent
6307          * updates to shared memory.)
6308          */
6309         {
6310                 /* use volatile pointer to prevent code rearrangement */
6311                 volatile XLogCtlData *xlogctl = XLogCtl;
6312
6313                 SpinLockAcquire(&xlogctl->info_lck);
6314                 xlogctl->SharedRecoveryInProgress = false;
6315                 SpinLockRelease(&xlogctl->info_lck);
6316         }
6317 }
6318
6319 /*
6320  * Checks if recovery has reached a consistent state. When consistency is
6321  * reached and we have a valid starting standby snapshot, tell postmaster
6322  * that it can start accepting read-only connections.
6323  */
6324 static void
6325 CheckRecoveryConsistency(void)
6326 {
6327         /*
6328          * During crash recovery, we don't reach a consistent state until we've
6329          * replayed all the WAL.
6330          */
6331         if (XLogRecPtrIsInvalid(minRecoveryPoint))
6332                 return;
6333
6334         /*
6335          * Have we passed our safe starting point? Note that minRecoveryPoint
6336          * is known to be incorrectly set if ControlFile->backupEndRequired,
6337          * until the XLOG_BACKUP_RECORD arrives to advise us of the correct
6338          * minRecoveryPoint. All we prior to that is its not consistent yet.
6339          */
6340         if (!reachedConsistency && !ControlFile->backupEndRequired &&
6341                 XLByteLE(minRecoveryPoint, EndRecPtr) &&
6342                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6343         {
6344                 /*
6345                  * Check to see if the XLOG sequence contained any unresolved
6346                  * references to uninitialized pages.
6347                  */
6348                 XLogCheckInvalidPages();
6349
6350                 reachedConsistency = true;
6351                 ereport(LOG,
6352                                 (errmsg("consistent recovery state reached at %X/%X",
6353                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
6354         }
6355
6356         /*
6357          * Have we got a valid starting snapshot that will allow queries to be
6358          * run? If so, we can tell postmaster that the database is consistent now,
6359          * enabling connections.
6360          */
6361         if (standbyState == STANDBY_SNAPSHOT_READY &&
6362                 !LocalHotStandbyActive &&
6363                 reachedConsistency &&
6364                 IsUnderPostmaster)
6365         {
6366                 /* use volatile pointer to prevent code rearrangement */
6367                 volatile XLogCtlData *xlogctl = XLogCtl;
6368
6369                 SpinLockAcquire(&xlogctl->info_lck);
6370                 xlogctl->SharedHotStandbyActive = true;
6371                 SpinLockRelease(&xlogctl->info_lck);
6372
6373                 LocalHotStandbyActive = true;
6374
6375                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
6376         }
6377 }
6378
6379 /*
6380  * Is the system still in recovery?
6381  *
6382  * Unlike testing InRecovery, this works in any process that's connected to
6383  * shared memory.
6384  *
6385  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
6386  * variables the first time we see that recovery is finished.
6387  */
6388 bool
6389 RecoveryInProgress(void)
6390 {
6391         /*
6392          * We check shared state each time only until we leave recovery mode. We
6393          * can't re-enter recovery, so there's no need to keep checking after the
6394          * shared variable has once been seen false.
6395          */
6396         if (!LocalRecoveryInProgress)
6397                 return false;
6398         else
6399         {
6400                 /* use volatile pointer to prevent code rearrangement */
6401                 volatile XLogCtlData *xlogctl = XLogCtl;
6402
6403                 /* spinlock is essential on machines with weak memory ordering! */
6404                 SpinLockAcquire(&xlogctl->info_lck);
6405                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
6406                 SpinLockRelease(&xlogctl->info_lck);
6407
6408                 /*
6409                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
6410                  * is finished. InitPostgres() relies upon this behaviour to ensure
6411                  * that InitXLOGAccess() is called at backend startup.  (If you change
6412                  * this, see also LocalSetXLogInsertAllowed.)
6413                  */
6414                 if (!LocalRecoveryInProgress)
6415                         InitXLOGAccess();
6416
6417                 return LocalRecoveryInProgress;
6418         }
6419 }
6420
6421 /*
6422  * Is HotStandby active yet? This is only important in special backends
6423  * since normal backends won't ever be able to connect until this returns
6424  * true. Postmaster knows this by way of signal, not via shared memory.
6425  *
6426  * Unlike testing standbyState, this works in any process that's connected to
6427  * shared memory.
6428  */
6429 bool
6430 HotStandbyActive(void)
6431 {
6432         /*
6433          * We check shared state each time only until Hot Standby is active. We
6434          * can't de-activate Hot Standby, so there's no need to keep checking
6435          * after the shared variable has once been seen true.
6436          */
6437         if (LocalHotStandbyActive)
6438                 return true;
6439         else
6440         {
6441                 /* use volatile pointer to prevent code rearrangement */
6442                 volatile XLogCtlData *xlogctl = XLogCtl;
6443
6444                 /* spinlock is essential on machines with weak memory ordering! */
6445                 SpinLockAcquire(&xlogctl->info_lck);
6446                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
6447                 SpinLockRelease(&xlogctl->info_lck);
6448
6449                 return LocalHotStandbyActive;
6450         }
6451 }
6452
6453 /*
6454  * Is this process allowed to insert new WAL records?
6455  *
6456  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
6457  * But we also have provisions for forcing the result "true" or "false"
6458  * within specific processes regardless of the global state.
6459  */
6460 bool
6461 XLogInsertAllowed(void)
6462 {
6463         /*
6464          * If value is "unconditionally true" or "unconditionally false", just
6465          * return it.  This provides the normal fast path once recovery is known
6466          * done.
6467          */
6468         if (LocalXLogInsertAllowed >= 0)
6469                 return (bool) LocalXLogInsertAllowed;
6470
6471         /*
6472          * Else, must check to see if we're still in recovery.
6473          */
6474         if (RecoveryInProgress())
6475                 return false;
6476
6477         /*
6478          * On exit from recovery, reset to "unconditionally true", since there is
6479          * no need to keep checking.
6480          */
6481         LocalXLogInsertAllowed = 1;
6482         return true;
6483 }
6484
6485 /*
6486  * Make XLogInsertAllowed() return true in the current process only.
6487  *
6488  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6489  * and even call LocalSetXLogInsertAllowed() again after that.
6490  */
6491 static void
6492 LocalSetXLogInsertAllowed(void)
6493 {
6494         Assert(LocalXLogInsertAllowed == -1);
6495         LocalXLogInsertAllowed = 1;
6496
6497         /* Initialize as RecoveryInProgress() would do when switching state */
6498         InitXLOGAccess();
6499 }
6500
6501 /*
6502  * Subroutine to try to fetch and validate a prior checkpoint record.
6503  *
6504  * whichChkpt identifies the checkpoint (merely for reporting purposes).
6505  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
6506  */
6507 static XLogRecord *
6508 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
6509 {
6510         XLogRecord *record;
6511
6512         if (!XRecOffIsValid(RecPtr))
6513         {
6514                 switch (whichChkpt)
6515                 {
6516                         case 1:
6517                                 ereport(LOG,
6518                                 (errmsg("invalid primary checkpoint link in control file")));
6519                                 break;
6520                         case 2:
6521                                 ereport(LOG,
6522                                                 (errmsg("invalid secondary checkpoint link in control file")));
6523                                 break;
6524                         default:
6525                                 ereport(LOG,
6526                                    (errmsg("invalid checkpoint link in backup_label file")));
6527                                 break;
6528                 }
6529                 return NULL;
6530         }
6531
6532         record = ReadRecord(&RecPtr, LOG, true);
6533
6534         if (record == NULL)
6535         {
6536                 switch (whichChkpt)
6537                 {
6538                         case 1:
6539                                 ereport(LOG,
6540                                                 (errmsg("invalid primary checkpoint record")));
6541                                 break;
6542                         case 2:
6543                                 ereport(LOG,
6544                                                 (errmsg("invalid secondary checkpoint record")));
6545                                 break;
6546                         default:
6547                                 ereport(LOG,
6548                                                 (errmsg("invalid checkpoint record")));
6549                                 break;
6550                 }
6551                 return NULL;
6552         }
6553         if (record->xl_rmid != RM_XLOG_ID)
6554         {
6555                 switch (whichChkpt)
6556                 {
6557                         case 1:
6558                                 ereport(LOG,
6559                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
6560                                 break;
6561                         case 2:
6562                                 ereport(LOG,
6563                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
6564                                 break;
6565                         default:
6566                                 ereport(LOG,
6567                                 (errmsg("invalid resource manager ID in checkpoint record")));
6568                                 break;
6569                 }
6570                 return NULL;
6571         }
6572         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
6573                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
6574         {
6575                 switch (whichChkpt)
6576                 {
6577                         case 1:
6578                                 ereport(LOG,
6579                                    (errmsg("invalid xl_info in primary checkpoint record")));
6580                                 break;
6581                         case 2:
6582                                 ereport(LOG,
6583                                  (errmsg("invalid xl_info in secondary checkpoint record")));
6584                                 break;
6585                         default:
6586                                 ereport(LOG,
6587                                                 (errmsg("invalid xl_info in checkpoint record")));
6588                                 break;
6589                 }
6590                 return NULL;
6591         }
6592         if (record->xl_len != sizeof(CheckPoint) ||
6593                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
6594         {
6595                 switch (whichChkpt)
6596                 {
6597                         case 1:
6598                                 ereport(LOG,
6599                                         (errmsg("invalid length of primary checkpoint record")));
6600                                 break;
6601                         case 2:
6602                                 ereport(LOG,
6603                                   (errmsg("invalid length of secondary checkpoint record")));
6604                                 break;
6605                         default:
6606                                 ereport(LOG,
6607                                                 (errmsg("invalid length of checkpoint record")));
6608                                 break;
6609                 }
6610                 return NULL;
6611         }
6612         return record;
6613 }
6614
6615 /*
6616  * This must be called during startup of a backend process, except that
6617  * it need not be called in a standalone backend (which does StartupXLOG
6618  * instead).  We need to initialize the local copies of ThisTimeLineID and
6619  * RedoRecPtr.
6620  *
6621  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6622  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6623  * unnecessary however, since the postmaster itself never touches XLOG anyway.
6624  */
6625 void
6626 InitXLOGAccess(void)
6627 {
6628         /* ThisTimeLineID doesn't change so we need no lock to copy it */
6629         ThisTimeLineID = XLogCtl->ThisTimeLineID;
6630         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
6631
6632         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
6633         (void) GetRedoRecPtr();
6634 }
6635
6636 /*
6637  * Once spawned, a backend may update its local RedoRecPtr from
6638  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
6639  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
6640  */
6641 XLogRecPtr
6642 GetRedoRecPtr(void)
6643 {
6644         /* use volatile pointer to prevent code rearrangement */
6645         volatile XLogCtlData *xlogctl = XLogCtl;
6646
6647         SpinLockAcquire(&xlogctl->info_lck);
6648         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
6649         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6650         SpinLockRelease(&xlogctl->info_lck);
6651
6652         return RedoRecPtr;
6653 }
6654
6655 /*
6656  * GetInsertRecPtr -- Returns the current insert position.
6657  *
6658  * NOTE: The value *actually* returned is the position of the last full
6659  * xlog page. It lags behind the real insert position by at most 1 page.
6660  * For that, we don't need to acquire WALInsertLock which can be quite
6661  * heavily contended, and an approximation is enough for the current
6662  * usage of this function.
6663  */
6664 XLogRecPtr
6665 GetInsertRecPtr(void)
6666 {
6667         /* use volatile pointer to prevent code rearrangement */
6668         volatile XLogCtlData *xlogctl = XLogCtl;
6669         XLogRecPtr      recptr;
6670
6671         SpinLockAcquire(&xlogctl->info_lck);
6672         recptr = xlogctl->LogwrtRqst.Write;
6673         SpinLockRelease(&xlogctl->info_lck);
6674
6675         return recptr;
6676 }
6677
6678 /*
6679  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6680  * position known to be fsync'd to disk.
6681  */
6682 XLogRecPtr
6683 GetFlushRecPtr(void)
6684 {
6685         /* use volatile pointer to prevent code rearrangement */
6686         volatile XLogCtlData *xlogctl = XLogCtl;
6687         XLogRecPtr      recptr;
6688
6689         SpinLockAcquire(&xlogctl->info_lck);
6690         recptr = xlogctl->LogwrtResult.Flush;
6691         SpinLockRelease(&xlogctl->info_lck);
6692
6693         return recptr;
6694 }
6695
6696 /*
6697  * Get the time of the last xlog segment switch
6698  */
6699 pg_time_t
6700 GetLastSegSwitchTime(void)
6701 {
6702         pg_time_t       result;
6703
6704         /* Need WALWriteLock, but shared lock is sufficient */
6705         LWLockAcquire(WALWriteLock, LW_SHARED);
6706         result = XLogCtl->Write.lastSegSwitchTime;
6707         LWLockRelease(WALWriteLock);
6708
6709         return result;
6710 }
6711
6712 /*
6713  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
6714  *
6715  * This is exported for use by code that would like to have 64-bit XIDs.
6716  * We don't really support such things, but all XIDs within the system
6717  * can be presumed "close to" the result, and thus the epoch associated
6718  * with them can be determined.
6719  */
6720 void
6721 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
6722 {
6723         uint32          ckptXidEpoch;
6724         TransactionId ckptXid;
6725         TransactionId nextXid;
6726
6727         /* Must read checkpoint info first, else have race condition */
6728         {
6729                 /* use volatile pointer to prevent code rearrangement */
6730                 volatile XLogCtlData *xlogctl = XLogCtl;
6731
6732                 SpinLockAcquire(&xlogctl->info_lck);
6733                 ckptXidEpoch = xlogctl->ckptXidEpoch;
6734                 ckptXid = xlogctl->ckptXid;
6735                 SpinLockRelease(&xlogctl->info_lck);
6736         }
6737
6738         /* Now fetch current nextXid */
6739         nextXid = ReadNewTransactionId();
6740
6741         /*
6742          * nextXid is certainly logically later than ckptXid.  So if it's
6743          * numerically less, it must have wrapped into the next epoch.
6744          */
6745         if (nextXid < ckptXid)
6746                 ckptXidEpoch++;
6747
6748         *xid = nextXid;
6749         *epoch = ckptXidEpoch;
6750 }
6751
6752 /*
6753  * GetRecoveryTargetTLI - get the current recovery target timeline ID
6754  */
6755 TimeLineID
6756 GetRecoveryTargetTLI(void)
6757 {
6758         /* use volatile pointer to prevent code rearrangement */
6759         volatile XLogCtlData *xlogctl = XLogCtl;
6760         TimeLineID result;
6761
6762         SpinLockAcquire(&xlogctl->info_lck);
6763         result = xlogctl->RecoveryTargetTLI;
6764         SpinLockRelease(&xlogctl->info_lck);
6765
6766         return result;
6767 }
6768
6769 /*
6770  * This must be called ONCE during postmaster or standalone-backend shutdown
6771  */
6772 void
6773 ShutdownXLOG(int code, Datum arg)
6774 {
6775         ereport(LOG,
6776                         (errmsg("shutting down")));
6777
6778         if (RecoveryInProgress())
6779                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6780         else
6781         {
6782                 /*
6783                  * If archiving is enabled, rotate the last XLOG file so that all the
6784                  * remaining records are archived (postmaster wakes up the archiver
6785                  * process one more time at the end of shutdown). The checkpoint
6786                  * record will go to the next XLOG file and won't be archived (yet).
6787                  */
6788                 if (XLogArchivingActive() && XLogArchiveCommandSet())
6789                         RequestXLogSwitch();
6790
6791                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6792         }
6793         ShutdownCLOG();
6794         ShutdownSUBTRANS();
6795         ShutdownMultiXact();
6796
6797         ereport(LOG,
6798                         (errmsg("database system is shut down")));
6799 }
6800
6801 /*
6802  * Log start of a checkpoint.
6803  */
6804 static void
6805 LogCheckpointStart(int flags, bool restartpoint)
6806 {
6807         const char *msg;
6808
6809         /*
6810          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
6811          * the main message, but what about all the flags?
6812          */
6813         if (restartpoint)
6814                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
6815         else
6816                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
6817
6818         elog(LOG, msg,
6819                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6820                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6821                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6822                  (flags & CHECKPOINT_FORCE) ? " force" : "",
6823                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
6824                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
6825                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
6826 }
6827
6828 /*
6829  * Log end of a checkpoint.
6830  */
6831 static void
6832 LogCheckpointEnd(bool restartpoint)
6833 {
6834         long            write_secs,
6835                                 sync_secs,
6836                                 total_secs,
6837                                 longest_secs,
6838                                 average_secs;
6839         int                     write_usecs,
6840                                 sync_usecs,
6841                                 total_usecs,
6842                                 longest_usecs,
6843                                 average_usecs;
6844         uint64          average_sync_time;
6845
6846         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6847
6848         TimestampDifference(CheckpointStats.ckpt_write_t,
6849                                                 CheckpointStats.ckpt_sync_t,
6850                                                 &write_secs, &write_usecs);
6851
6852         TimestampDifference(CheckpointStats.ckpt_sync_t,
6853                                                 CheckpointStats.ckpt_sync_end_t,
6854                                                 &sync_secs, &sync_usecs);
6855
6856         /* Accumulate checkpoint timing summary data, in milliseconds. */
6857         BgWriterStats.m_checkpoint_write_time +=
6858                 write_secs * 1000 + write_usecs / 1000;
6859         BgWriterStats.m_checkpoint_sync_time +=
6860                 sync_secs * 1000 + sync_usecs / 1000;
6861
6862         /*
6863          * All of the published timing statistics are accounted for.  Only
6864          * continue if a log message is to be written.
6865          */
6866         if (!log_checkpoints)
6867                 return;
6868
6869         TimestampDifference(CheckpointStats.ckpt_start_t,
6870                                                 CheckpointStats.ckpt_end_t,
6871                                                 &total_secs, &total_usecs);
6872
6873         /*
6874          * Timing values returned from CheckpointStats are in microseconds.
6875          * Convert to the second plus microsecond form that TimestampDifference
6876          * returns for homogeneous printing.
6877          */
6878         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
6879         longest_usecs = CheckpointStats.ckpt_longest_sync -
6880                 (uint64) longest_secs *1000000;
6881
6882         average_sync_time = 0;
6883         if (CheckpointStats.ckpt_sync_rels > 0)
6884                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
6885                         CheckpointStats.ckpt_sync_rels;
6886         average_secs = (long) (average_sync_time / 1000000);
6887         average_usecs = average_sync_time - (uint64) average_secs *1000000;
6888
6889         if (restartpoint)
6890                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
6891                          "%d transaction log file(s) added, %d removed, %d recycled; "
6892                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6893                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
6894                          CheckpointStats.ckpt_bufs_written,
6895                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6896                          CheckpointStats.ckpt_segs_added,
6897                          CheckpointStats.ckpt_segs_removed,
6898                          CheckpointStats.ckpt_segs_recycled,
6899                          write_secs, write_usecs / 1000,
6900                          sync_secs, sync_usecs / 1000,
6901                          total_secs, total_usecs / 1000,
6902                          CheckpointStats.ckpt_sync_rels,
6903                          longest_secs, longest_usecs / 1000,
6904                          average_secs, average_usecs / 1000);
6905         else
6906                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
6907                          "%d transaction log file(s) added, %d removed, %d recycled; "
6908                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6909                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
6910                          CheckpointStats.ckpt_bufs_written,
6911                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6912                          CheckpointStats.ckpt_segs_added,
6913                          CheckpointStats.ckpt_segs_removed,
6914                          CheckpointStats.ckpt_segs_recycled,
6915                          write_secs, write_usecs / 1000,
6916                          sync_secs, sync_usecs / 1000,
6917                          total_secs, total_usecs / 1000,
6918                          CheckpointStats.ckpt_sync_rels,
6919                          longest_secs, longest_usecs / 1000,
6920                          average_secs, average_usecs / 1000);
6921 }
6922
6923 /*
6924  * Perform a checkpoint --- either during shutdown, or on-the-fly
6925  *
6926  * flags is a bitwise OR of the following:
6927  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6928  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6929  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6930  *              ignoring checkpoint_completion_target parameter.
6931  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
6932  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6933  *              CHECKPOINT_END_OF_RECOVERY).
6934  *
6935  * Note: flags contains other bits, of interest here only for logging purposes.
6936  * In particular note that this routine is synchronous and does not pay
6937  * attention to CHECKPOINT_WAIT.
6938  *
6939  * If !shutdown then we are writing an online checkpoint. This is a very special
6940  * kind of operation and WAL record because the checkpoint action occurs over
6941  * a period of time yet logically occurs at just a single LSN. The logical
6942  * position of the WAL record (redo ptr) is the same or earlier than the
6943  * physical position. When we replay WAL we locate the checkpoint via its
6944  * physical position then read the redo ptr and actually start replay at the
6945  * earlier logical position. Note that we don't write *anything* to WAL at
6946  * the logical position, so that location could be any other kind of WAL record.
6947  * All of this mechanism allows us to continue working while we checkpoint.
6948  * As a result, timing of actions is critical here and be careful to note that
6949  * this function will likely take minutes to execute on a busy system.
6950  */
6951 void
6952 CreateCheckPoint(int flags)
6953 {
6954         bool            shutdown;
6955         CheckPoint      checkPoint;
6956         XLogRecPtr      recptr;
6957         XLogCtlInsert *Insert = &XLogCtl->Insert;
6958         XLogRecData rdata;
6959         uint32          freespace;
6960         XLogSegNo       _logSegNo;
6961         VirtualTransactionId *vxids;
6962         int     nvxids;
6963
6964         /*
6965          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6966          * issued at a different time.
6967          */
6968         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6969                 shutdown = true;
6970         else
6971                 shutdown = false;
6972
6973         /* sanity check */
6974         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
6975                 elog(ERROR, "can't create a checkpoint during recovery");
6976
6977         /*
6978          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
6979          * (This is just pro forma, since in the present system structure there is
6980          * only one process that is allowed to issue checkpoints at any given
6981          * time.)
6982          */
6983         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6984
6985         /*
6986          * Prepare to accumulate statistics.
6987          *
6988          * Note: because it is possible for log_checkpoints to change while a
6989          * checkpoint proceeds, we always accumulate stats, even if
6990          * log_checkpoints is currently off.
6991          */
6992         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6993         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6994
6995         /*
6996          * Use a critical section to force system panic if we have trouble.
6997          */
6998         START_CRIT_SECTION();
6999
7000         if (shutdown)
7001         {
7002                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7003                 ControlFile->state = DB_SHUTDOWNING;
7004                 ControlFile->time = (pg_time_t) time(NULL);
7005                 UpdateControlFile();
7006                 LWLockRelease(ControlFileLock);
7007         }
7008
7009         /*
7010          * Let smgr prepare for checkpoint; this has to happen before we determine
7011          * the REDO pointer.  Note that smgr must not do anything that'd have to
7012          * be undone if we decide no checkpoint is needed.
7013          */
7014         smgrpreckpt();
7015
7016         /* Begin filling in the checkpoint WAL record */
7017         MemSet(&checkPoint, 0, sizeof(checkPoint));
7018         checkPoint.time = (pg_time_t) time(NULL);
7019
7020         /*
7021          * For Hot Standby, derive the oldestActiveXid before we fix the redo
7022          * pointer. This allows us to begin accumulating changes to assemble our
7023          * starting snapshot of locks and transactions.
7024          */
7025         if (!shutdown && XLogStandbyInfoActive())
7026                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7027         else
7028                 checkPoint.oldestActiveXid = InvalidTransactionId;
7029
7030         /*
7031          * We must hold WALInsertLock while examining insert state to determine
7032          * the checkpoint REDO pointer.
7033          */
7034         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7035
7036         /*
7037          * If this isn't a shutdown or forced checkpoint, and we have not inserted
7038          * any XLOG records since the start of the last checkpoint, skip the
7039          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
7040          * when the system is idle. That wastes log space, and more importantly it
7041          * exposes us to possible loss of both current and previous checkpoint
7042          * records if the machine crashes just as we're writing the update.
7043          * (Perhaps it'd make even more sense to checkpoint only when the previous
7044          * checkpoint record is in a different xlog page?)
7045          *
7046          * We have to make two tests to determine that nothing has happened since
7047          * the start of the last checkpoint: current insertion point must match
7048          * the end of the last checkpoint record, and its redo pointer must point
7049          * to itself.
7050          */
7051         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7052                                   CHECKPOINT_FORCE)) == 0)
7053         {
7054                 XLogRecPtr      curInsert;
7055
7056                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
7057                 if (curInsert == ControlFile->checkPoint +
7058                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
7059                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
7060                 {
7061                         LWLockRelease(WALInsertLock);
7062                         LWLockRelease(CheckpointLock);
7063                         END_CRIT_SECTION();
7064                         return;
7065                 }
7066         }
7067
7068         /*
7069          * An end-of-recovery checkpoint is created before anyone is allowed to
7070          * write WAL. To allow us to write the checkpoint record, temporarily
7071          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
7072          * initialized, which we need here and in AdvanceXLInsertBuffer.)
7073          */
7074         if (flags & CHECKPOINT_END_OF_RECOVERY)
7075                 LocalSetXLogInsertAllowed();
7076
7077         checkPoint.ThisTimeLineID = ThisTimeLineID;
7078         checkPoint.fullPageWrites = Insert->fullPageWrites;
7079
7080         /*
7081          * Compute new REDO record ptr = location of next XLOG record.
7082          *
7083          * NB: this is NOT necessarily where the checkpoint record itself will be,
7084          * since other backends may insert more XLOG records while we're off doing
7085          * the buffer flush work.  Those XLOG records are logically after the
7086          * checkpoint, even though physically before it.  Got that?
7087          */
7088         freespace = INSERT_FREESPACE(Insert);
7089         if (freespace == 0)
7090         {
7091                 (void) AdvanceXLInsertBuffer(false);
7092                 /* OK to ignore update return flag, since we will do flush anyway */
7093                 freespace = INSERT_FREESPACE(Insert);
7094         }
7095         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
7096
7097         /*
7098          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
7099          * must be done while holding the insert lock AND the info_lck.
7100          *
7101          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
7102          * pointing past where it really needs to point.  This is okay; the only
7103          * consequence is that XLogInsert might back up whole buffers that it
7104          * didn't really need to.  We can't postpone advancing RedoRecPtr because
7105          * XLogInserts that happen while we are dumping buffers must assume that
7106          * their buffer changes are not included in the checkpoint.
7107          */
7108         {
7109                 /* use volatile pointer to prevent code rearrangement */
7110                 volatile XLogCtlData *xlogctl = XLogCtl;
7111
7112                 SpinLockAcquire(&xlogctl->info_lck);
7113                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
7114                 SpinLockRelease(&xlogctl->info_lck);
7115         }
7116
7117         /*
7118          * Now we can release WAL insert lock, allowing other xacts to proceed
7119          * while we are flushing disk buffers.
7120          */
7121         LWLockRelease(WALInsertLock);
7122
7123         /*
7124          * If enabled, log checkpoint start.  We postpone this until now so as not
7125          * to log anything if we decided to skip the checkpoint.
7126          */
7127         if (log_checkpoints)
7128                 LogCheckpointStart(flags, false);
7129
7130         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7131
7132         /*
7133          * In some cases there are groups of actions that must all occur on
7134          * one side or the other of a checkpoint record. Before flushing the
7135          * checkpoint record we must explicitly wait for any backend currently
7136          * performing those groups of actions.
7137          *
7138          * One example is end of transaction, so we must wait for any transactions
7139          * that are currently in commit critical sections.  If an xact inserted
7140          * its commit record into XLOG just before the REDO point, then a crash
7141          * restart from the REDO point would not replay that record, which means
7142          * that our flushing had better include the xact's update of pg_clog.  So
7143          * we wait till he's out of his commit critical section before proceeding.
7144          * See notes in RecordTransactionCommit().
7145          *
7146          * Because we've already released WALInsertLock, this test is a bit fuzzy:
7147          * it is possible that we will wait for xacts we didn't really need to
7148          * wait for.  But the delay should be short and it seems better to make
7149          * checkpoint take a bit longer than to hold locks longer than necessary.
7150          * (In fact, the whole reason we have this issue is that xact.c does
7151          * commit record XLOG insertion and clog update as two separate steps
7152          * protected by different locks, but again that seems best on grounds of
7153          * minimizing lock contention.)
7154          *
7155          * A transaction that has not yet set delayChkpt when we look cannot be at
7156          * risk, since he's not inserted his commit record yet; and one that's
7157          * already cleared it is not at risk either, since he's done fixing clog
7158          * and we will correctly flush the update below.  So we cannot miss any
7159          * xacts we need to wait for.
7160          */
7161         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
7162         if (nvxids > 0)
7163         {
7164                 uint32  nwaits = 0;
7165
7166                 do
7167                 {
7168                         pg_usleep(10000L);      /* wait for 10 msec */
7169                         nwaits++;
7170                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
7171         }
7172         pfree(vxids);
7173
7174         /*
7175          * Get the other info we need for the checkpoint record.
7176          */
7177         LWLockAcquire(XidGenLock, LW_SHARED);
7178         checkPoint.nextXid = ShmemVariableCache->nextXid;
7179         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
7180         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7181         LWLockRelease(XidGenLock);
7182
7183         /* Increase XID epoch if we've wrapped around since last checkpoint */
7184         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
7185         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
7186                 checkPoint.nextXidEpoch++;
7187
7188         LWLockAcquire(OidGenLock, LW_SHARED);
7189         checkPoint.nextOid = ShmemVariableCache->nextOid;
7190         if (!shutdown)
7191                 checkPoint.nextOid += ShmemVariableCache->oidCount;
7192         LWLockRelease(OidGenLock);
7193
7194         MultiXactGetCheckptMulti(shutdown,
7195                                                          &checkPoint.nextMulti,
7196                                                          &checkPoint.nextMultiOffset);
7197
7198         /*
7199          * Having constructed the checkpoint record, ensure all shmem disk buffers
7200          * and commit-log buffers are flushed to disk.
7201          *
7202          * This I/O could fail for various reasons.  If so, we will fail to
7203          * complete the checkpoint, but there is no reason to force a system
7204          * panic. Accordingly, exit critical section while doing it.
7205          */
7206         END_CRIT_SECTION();
7207
7208         CheckPointGuts(checkPoint.redo, flags);
7209
7210         /*
7211          * Take a snapshot of running transactions and write this to WAL. This
7212          * allows us to reconstruct the state of running transactions during
7213          * archive recovery, if required. Skip, if this info disabled.
7214          *
7215          * If we are shutting down, or Startup process is completing crash
7216          * recovery we don't need to write running xact data.
7217          */
7218         if (!shutdown && XLogStandbyInfoActive())
7219                 LogStandbySnapshot();
7220
7221         START_CRIT_SECTION();
7222
7223         /*
7224          * Now insert the checkpoint record into XLOG.
7225          */
7226         rdata.data = (char *) (&checkPoint);
7227         rdata.len = sizeof(checkPoint);
7228         rdata.buffer = InvalidBuffer;
7229         rdata.next = NULL;
7230
7231         recptr = XLogInsert(RM_XLOG_ID,
7232                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7233                                                 XLOG_CHECKPOINT_ONLINE,
7234                                                 &rdata);
7235
7236         XLogFlush(recptr);
7237
7238         /*
7239          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7240          * overwritten at next startup.  No-one should even try, this just allows
7241          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7242          * to just temporarily disable writing until the system has exited
7243          * recovery.
7244          */
7245         if (shutdown)
7246         {
7247                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7248                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7249                 else
7250                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7251         }
7252
7253         /*
7254          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7255          * = end of actual checkpoint record.
7256          */
7257         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
7258                 ereport(PANIC,
7259                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
7260
7261         /*
7262          * Select point at which we can truncate the log, which we base on the
7263          * prior checkpoint's earliest info.
7264          */
7265         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7266
7267         /*
7268          * Update the control file.
7269          */
7270         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7271         if (shutdown)
7272                 ControlFile->state = DB_SHUTDOWNED;
7273         ControlFile->prevCheckPoint = ControlFile->checkPoint;
7274         ControlFile->checkPoint = ProcLastRecPtr;
7275         ControlFile->checkPointCopy = checkPoint;
7276         ControlFile->time = (pg_time_t) time(NULL);
7277         /* crash recovery should always recover to the end of WAL */
7278         MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
7279         ControlFile->minRecoveryPointTLI = 0;
7280         UpdateControlFile();
7281         LWLockRelease(ControlFileLock);
7282
7283         /* Update shared-memory copy of checkpoint XID/epoch */
7284         {
7285                 /* use volatile pointer to prevent code rearrangement */
7286                 volatile XLogCtlData *xlogctl = XLogCtl;
7287
7288                 SpinLockAcquire(&xlogctl->info_lck);
7289                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7290                 xlogctl->ckptXid = checkPoint.nextXid;
7291                 SpinLockRelease(&xlogctl->info_lck);
7292         }
7293
7294         /*
7295          * We are now done with critical updates; no need for system panic if we
7296          * have trouble while fooling with old log segments.
7297          */
7298         END_CRIT_SECTION();
7299
7300         /*
7301          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7302          */
7303         smgrpostckpt();
7304
7305         /*
7306          * Delete old log files (those no longer needed even for previous
7307          * checkpoint or the standbys in XLOG streaming).
7308          */
7309         if (_logSegNo)
7310         {
7311                 KeepLogSeg(recptr, &_logSegNo);
7312                 _logSegNo--;
7313                 RemoveOldXlogFiles(_logSegNo, recptr);
7314         }
7315
7316         /*
7317          * Make more log segments if needed.  (Do this after recycling old log
7318          * segments, since that may supply some of the needed files.)
7319          */
7320         if (!shutdown)
7321                 PreallocXlogFiles(recptr);
7322
7323         /*
7324          * Truncate pg_subtrans if possible.  We can throw away all data before
7325          * the oldest XMIN of any running transaction.  No future transaction will
7326          * attempt to reference any pg_subtrans entry older than that (see Asserts
7327          * in subtrans.c).      During recovery, though, we mustn't do this because
7328          * StartupSUBTRANS hasn't been called yet.
7329          */
7330         if (!RecoveryInProgress())
7331                 TruncateSUBTRANS(GetOldestXmin(true, false));
7332
7333         /* Real work is done, but log and update stats before releasing lock. */
7334         LogCheckpointEnd(false);
7335
7336         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7337                                                                          NBuffers,
7338                                                                          CheckpointStats.ckpt_segs_added,
7339                                                                          CheckpointStats.ckpt_segs_removed,
7340                                                                          CheckpointStats.ckpt_segs_recycled);
7341
7342         LWLockRelease(CheckpointLock);
7343 }
7344
7345 /*
7346  * Flush all data in shared memory to disk, and fsync
7347  *
7348  * This is the common code shared between regular checkpoints and
7349  * recovery restartpoints.
7350  */
7351 static void
7352 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7353 {
7354         CheckPointCLOG();
7355         CheckPointSUBTRANS();
7356         CheckPointMultiXact();
7357         CheckPointPredicate();
7358         CheckPointRelationMap();
7359         CheckPointBuffers(flags);       /* performs all required fsyncs */
7360         /* We deliberately delay 2PC checkpointing as long as possible */
7361         CheckPointTwoPhase(checkPointRedo);
7362 }
7363
7364 /*
7365  * Save a checkpoint for recovery restart if appropriate
7366  *
7367  * This function is called each time a checkpoint record is read from XLOG.
7368  * It must determine whether the checkpoint represents a safe restartpoint or
7369  * not.  If so, the checkpoint record is stashed in shared memory so that
7370  * CreateRestartPoint can consult it.  (Note that the latter function is
7371  * executed by the checkpointer, while this one will be executed by the
7372  * startup process.)
7373  */
7374 static void
7375 RecoveryRestartPoint(const CheckPoint *checkPoint)
7376 {
7377         int                     rmid;
7378
7379         /* use volatile pointer to prevent code rearrangement */
7380         volatile XLogCtlData *xlogctl = XLogCtl;
7381
7382         /*
7383          * Is it safe to restartpoint?  We must ask each of the resource managers
7384          * whether they have any partial state information that might prevent a
7385          * correct restart from this point.  If so, we skip this opportunity, but
7386          * return at the next checkpoint record for another try.
7387          */
7388         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7389         {
7390                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
7391                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7392                         {
7393                                 elog(trace_recovery(DEBUG2),
7394                                          "RM %d not safe to record restart point at %X/%X",
7395                                          rmid,
7396                                          (uint32) (checkPoint->redo >> 32),
7397                                          (uint32) checkPoint->redo);
7398                                 return;
7399                         }
7400         }
7401
7402         /*
7403          * Also refrain from creating a restartpoint if we have seen any
7404          * references to non-existent pages. Restarting recovery from the
7405          * restartpoint would not see the references, so we would lose the
7406          * cross-check that the pages belonged to a relation that was dropped
7407          * later.
7408          */
7409         if (XLogHaveInvalidPages())
7410         {
7411                 elog(trace_recovery(DEBUG2),
7412                          "could not record restart point at %X/%X because there "
7413                          "are unresolved references to invalid pages",
7414                          (uint32) (checkPoint->redo >> 32),
7415                          (uint32) checkPoint->redo);
7416                 return;
7417         }
7418
7419         /*
7420          * Copy the checkpoint record to shared memory, so that checkpointer can
7421          * work out the next time it wants to perform a restartpoint.
7422          */
7423         SpinLockAcquire(&xlogctl->info_lck);
7424         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
7425         xlogctl->lastCheckPoint = *checkPoint;
7426         SpinLockRelease(&xlogctl->info_lck);
7427 }
7428
7429 /*
7430  * Establish a restartpoint if possible.
7431  *
7432  * This is similar to CreateCheckPoint, but is used during WAL recovery
7433  * to establish a point from which recovery can roll forward without
7434  * replaying the entire recovery log.
7435  *
7436  * Returns true if a new restartpoint was established. We can only establish
7437  * a restartpoint if we have replayed a safe checkpoint record since last
7438  * restartpoint.
7439  */
7440 bool
7441 CreateRestartPoint(int flags)
7442 {
7443         XLogRecPtr      lastCheckPointRecPtr;
7444         CheckPoint      lastCheckPoint;
7445         XLogSegNo       _logSegNo;
7446         TimestampTz xtime;
7447
7448         /* use volatile pointer to prevent code rearrangement */
7449         volatile XLogCtlData *xlogctl = XLogCtl;
7450
7451         /*
7452          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
7453          * happens at a time.
7454          */
7455         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7456
7457         /* Get a local copy of the last safe checkpoint record. */
7458         SpinLockAcquire(&xlogctl->info_lck);
7459         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
7460         lastCheckPoint = xlogctl->lastCheckPoint;
7461         SpinLockRelease(&xlogctl->info_lck);
7462
7463         /*
7464          * Check that we're still in recovery mode. It's ok if we exit recovery
7465          * mode after this check, the restart point is valid anyway.
7466          */
7467         if (!RecoveryInProgress())
7468         {
7469                 ereport(DEBUG2,
7470                           (errmsg("skipping restartpoint, recovery has already ended")));
7471                 LWLockRelease(CheckpointLock);
7472                 return false;
7473         }
7474
7475         /*
7476          * If the last checkpoint record we've replayed is already our last
7477          * restartpoint, we can't perform a new restart point. We still update
7478          * minRecoveryPoint in that case, so that if this is a shutdown restart
7479          * point, we won't start up earlier than before. That's not strictly
7480          * necessary, but when hot standby is enabled, it would be rather weird if
7481          * the database opened up for read-only connections at a point-in-time
7482          * before the last shutdown. Such time travel is still possible in case of
7483          * immediate shutdown, though.
7484          *
7485          * We don't explicitly advance minRecoveryPoint when we do create a
7486          * restartpoint. It's assumed that flushing the buffers will do that as a
7487          * side-effect.
7488          */
7489         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
7490                 XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
7491         {
7492                 ereport(DEBUG2,
7493                                 (errmsg("skipping restartpoint, already performed at %X/%X",
7494                                                 (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo)));
7495
7496                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7497                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7498                 {
7499                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7500                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7501                         ControlFile->time = (pg_time_t) time(NULL);
7502                         UpdateControlFile();
7503                         LWLockRelease(ControlFileLock);
7504                 }
7505                 LWLockRelease(CheckpointLock);
7506                 return false;
7507         }
7508
7509         /*
7510          * Update the shared RedoRecPtr so that the startup process can calculate
7511          * the number of segments replayed since last restartpoint, and request a
7512          * restartpoint if it exceeds checkpoint_segments.
7513          *
7514          * You need to hold WALInsertLock and info_lck to update it, although
7515          * during recovery acquiring WALInsertLock is just pro forma, because
7516          * there is no other processes updating Insert.RedoRecPtr.
7517          */
7518         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7519         SpinLockAcquire(&xlogctl->info_lck);
7520         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
7521         SpinLockRelease(&xlogctl->info_lck);
7522         LWLockRelease(WALInsertLock);
7523
7524         /*
7525          * Prepare to accumulate statistics.
7526          *
7527          * Note: because it is possible for log_checkpoints to change while a
7528          * checkpoint proceeds, we always accumulate stats, even if
7529          * log_checkpoints is currently off.
7530          */
7531         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7532         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7533
7534         if (log_checkpoints)
7535                 LogCheckpointStart(flags, true);
7536
7537         CheckPointGuts(lastCheckPoint.redo, flags);
7538
7539         /*
7540          * Select point at which we can truncate the xlog, which we base on the
7541          * prior checkpoint's earliest info.
7542          */
7543         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7544
7545         /*
7546          * Update pg_control, using current time.  Check that it still shows
7547          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
7548          * this is a quick hack to make sure nothing really bad happens if somehow
7549          * we get here after the end-of-recovery checkpoint.
7550          */
7551         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7552         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
7553                 XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
7554         {
7555                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
7556                 ControlFile->checkPoint = lastCheckPointRecPtr;
7557                 ControlFile->checkPointCopy = lastCheckPoint;
7558                 ControlFile->time = (pg_time_t) time(NULL);
7559                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7560                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7561                 UpdateControlFile();
7562         }
7563         LWLockRelease(ControlFileLock);
7564
7565         /*
7566          * Delete old log files (those no longer needed even for previous
7567          * checkpoint/restartpoint) to prevent the disk holding the xlog from
7568          * growing full.
7569          */
7570         if (_logSegNo)
7571         {
7572                 XLogRecPtr      endptr;
7573
7574                 /* Get the current (or recent) end of xlog */
7575                 endptr = GetStandbyFlushRecPtr(NULL);
7576
7577                 KeepLogSeg(endptr, &_logSegNo);
7578                 _logSegNo--;
7579                 RemoveOldXlogFiles(_logSegNo, endptr);
7580
7581                 /*
7582                  * Make more log segments if needed.  (Do this after recycling old log
7583                  * segments, since that may supply some of the needed files.)
7584                  */
7585                 PreallocXlogFiles(endptr);
7586         }
7587
7588         /*
7589          * Truncate pg_subtrans if possible.  We can throw away all data before
7590          * the oldest XMIN of any running transaction.  No future transaction will
7591          * attempt to reference any pg_subtrans entry older than that (see Asserts
7592          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
7593          * this because StartupSUBTRANS hasn't been called yet.
7594          */
7595         if (EnableHotStandby)
7596                 TruncateSUBTRANS(GetOldestXmin(true, false));
7597
7598         /* Real work is done, but log and update before releasing lock. */
7599         LogCheckpointEnd(true);
7600
7601         xtime = GetLatestXTime();
7602         ereport((log_checkpoints ? LOG : DEBUG2),
7603                         (errmsg("recovery restart point at %X/%X",
7604                                         (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
7605                    xtime ? errdetail("last completed transaction was at log time %s",
7606                                                          timestamptz_to_str(xtime)) : 0));
7607
7608         LWLockRelease(CheckpointLock);
7609
7610         /*
7611          * Finally, execute archive_cleanup_command, if any.
7612          */
7613         if (XLogCtl->archiveCleanupCommand[0])
7614                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
7615                                                            "archive_cleanup_command",
7616                                                            false);
7617
7618         return true;
7619 }
7620
7621 /*
7622  * Calculate the last segment that we need to retain because of
7623  * wal_keep_segments, by subtracting wal_keep_segments from
7624  * the given xlog location, recptr.
7625  */
7626 static void
7627 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
7628 {
7629         XLogSegNo       segno;
7630
7631         if (wal_keep_segments == 0)
7632                 return;
7633
7634         XLByteToSeg(recptr, segno);
7635
7636         /* avoid underflow, don't go below 1 */
7637         if (segno <= wal_keep_segments)
7638                 segno = 1;
7639         else
7640                 segno = *logSegNo - wal_keep_segments;
7641
7642         /* don't delete WAL segments newer than the calculated segment */
7643         if (segno < *logSegNo)
7644                 *logSegNo = segno;
7645 }
7646
7647 /*
7648  * Write a NEXTOID log record
7649  */
7650 void
7651 XLogPutNextOid(Oid nextOid)
7652 {
7653         XLogRecData rdata;
7654
7655         rdata.data = (char *) (&nextOid);
7656         rdata.len = sizeof(Oid);
7657         rdata.buffer = InvalidBuffer;
7658         rdata.next = NULL;
7659         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
7660
7661         /*
7662          * We need not flush the NEXTOID record immediately, because any of the
7663          * just-allocated OIDs could only reach disk as part of a tuple insert or
7664          * update that would have its own XLOG record that must follow the NEXTOID
7665          * record.      Therefore, the standard buffer LSN interlock applied to those
7666          * records will ensure no such OID reaches disk before the NEXTOID record
7667          * does.
7668          *
7669          * Note, however, that the above statement only covers state "within" the
7670          * database.  When we use a generated OID as a file or directory name, we
7671          * are in a sense violating the basic WAL rule, because that filesystem
7672          * change may reach disk before the NEXTOID WAL record does.  The impact
7673          * of this is that if a database crash occurs immediately afterward, we
7674          * might after restart re-generate the same OID and find that it conflicts
7675          * with the leftover file or directory.  But since for safety's sake we
7676          * always loop until finding a nonconflicting filename, this poses no real
7677          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7678          */
7679 }
7680
7681 /*
7682  * Write an XLOG SWITCH record.
7683  *
7684  * Here we just blindly issue an XLogInsert request for the record.
7685  * All the magic happens inside XLogInsert.
7686  *
7687  * The return value is either the end+1 address of the switch record,
7688  * or the end+1 address of the prior segment if we did not need to
7689  * write a switch record because we are already at segment start.
7690  */
7691 XLogRecPtr
7692 RequestXLogSwitch(void)
7693 {
7694         XLogRecPtr      RecPtr;
7695         XLogRecData rdata;
7696
7697         /* XLOG SWITCH, alone among xlog record types, has no data */
7698         rdata.buffer = InvalidBuffer;
7699         rdata.data = NULL;
7700         rdata.len = 0;
7701         rdata.next = NULL;
7702
7703         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
7704
7705         return RecPtr;
7706 }
7707
7708 /*
7709  * Write a RESTORE POINT record
7710  */
7711 XLogRecPtr
7712 XLogRestorePoint(const char *rpName)
7713 {
7714         XLogRecPtr      RecPtr;
7715         XLogRecData rdata;
7716         xl_restore_point xlrec;
7717
7718         xlrec.rp_time = GetCurrentTimestamp();
7719         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
7720
7721         rdata.buffer = InvalidBuffer;
7722         rdata.data = (char *) &xlrec;
7723         rdata.len = sizeof(xl_restore_point);
7724         rdata.next = NULL;
7725
7726         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
7727
7728         ereport(LOG,
7729                         (errmsg("restore point \"%s\" created at %X/%X",
7730                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
7731
7732         return RecPtr;
7733 }
7734
7735 /*
7736  * Check if any of the GUC parameters that are critical for hot standby
7737  * have changed, and update the value in pg_control file if necessary.
7738  */
7739 static void
7740 XLogReportParameters(void)
7741 {
7742         if (wal_level != ControlFile->wal_level ||
7743                 MaxConnections != ControlFile->MaxConnections ||
7744                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
7745                 max_locks_per_xact != ControlFile->max_locks_per_xact)
7746         {
7747                 /*
7748                  * The change in number of backend slots doesn't need to be WAL-logged
7749                  * if archiving is not enabled, as you can't start archive recovery
7750                  * with wal_level=minimal anyway. We don't really care about the
7751                  * values in pg_control either if wal_level=minimal, but seems better
7752                  * to keep them up-to-date to avoid confusion.
7753                  */
7754                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
7755                 {
7756                         XLogRecData rdata;
7757                         xl_parameter_change xlrec;
7758
7759                         xlrec.MaxConnections = MaxConnections;
7760                         xlrec.max_prepared_xacts = max_prepared_xacts;
7761                         xlrec.max_locks_per_xact = max_locks_per_xact;
7762                         xlrec.wal_level = wal_level;
7763
7764                         rdata.buffer = InvalidBuffer;
7765                         rdata.data = (char *) &xlrec;
7766                         rdata.len = sizeof(xlrec);
7767                         rdata.next = NULL;
7768
7769                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
7770                 }
7771
7772                 ControlFile->MaxConnections = MaxConnections;
7773                 ControlFile->max_prepared_xacts = max_prepared_xacts;
7774                 ControlFile->max_locks_per_xact = max_locks_per_xact;
7775                 ControlFile->wal_level = wal_level;
7776                 UpdateControlFile();
7777         }
7778 }
7779
7780 /*
7781  * Update full_page_writes in shared memory, and write an
7782  * XLOG_FPW_CHANGE record if necessary.
7783  *
7784  * Note: this function assumes there is no other process running
7785  * concurrently that could update it.
7786  */
7787 void
7788 UpdateFullPageWrites(void)
7789 {
7790         XLogCtlInsert *Insert = &XLogCtl->Insert;
7791
7792         /*
7793          * Do nothing if full_page_writes has not been changed.
7794          *
7795          * It's safe to check the shared full_page_writes without the lock,
7796          * because we assume that there is no concurrently running process which
7797          * can update it.
7798          */
7799         if (fullPageWrites == Insert->fullPageWrites)
7800                 return;
7801
7802         START_CRIT_SECTION();
7803
7804         /*
7805          * It's always safe to take full page images, even when not strictly
7806          * required, but not the other round. So if we're setting full_page_writes
7807          * to true, first set it true and then write the WAL record. If we're
7808          * setting it to false, first write the WAL record and then set the global
7809          * flag.
7810          */
7811         if (fullPageWrites)
7812         {
7813                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7814                 Insert->fullPageWrites = true;
7815                 LWLockRelease(WALInsertLock);
7816         }
7817
7818         /*
7819          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
7820          * full_page_writes during archive recovery, if required.
7821          */
7822         if (XLogStandbyInfoActive() && !RecoveryInProgress())
7823         {
7824                 XLogRecData rdata;
7825
7826                 rdata.data = (char *) (&fullPageWrites);
7827                 rdata.len = sizeof(bool);
7828                 rdata.buffer = InvalidBuffer;
7829                 rdata.next = NULL;
7830
7831                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
7832         }
7833
7834         if (!fullPageWrites)
7835         {
7836                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7837                 Insert->fullPageWrites = false;
7838                 LWLockRelease(WALInsertLock);
7839         }
7840         END_CRIT_SECTION();
7841 }
7842
7843 /*
7844  * XLOG resource manager's routines
7845  *
7846  * Definitions of info values are in include/catalog/pg_control.h, though
7847  * not all record types are related to control file updates.
7848  */
7849 void
7850 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
7851 {
7852         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7853
7854         /* Backup blocks are not used in xlog records */
7855         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
7856
7857         if (info == XLOG_NEXTOID)
7858         {
7859                 Oid                     nextOid;
7860
7861                 /*
7862                  * We used to try to take the maximum of ShmemVariableCache->nextOid
7863                  * and the recorded nextOid, but that fails if the OID counter wraps
7864                  * around.      Since no OID allocation should be happening during replay
7865                  * anyway, better to just believe the record exactly.  We still take
7866                  * OidGenLock while setting the variable, just in case.
7867                  */
7868                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
7869                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7870                 ShmemVariableCache->nextOid = nextOid;
7871                 ShmemVariableCache->oidCount = 0;
7872                 LWLockRelease(OidGenLock);
7873         }
7874         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
7875         {
7876                 CheckPoint      checkPoint;
7877
7878                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7879                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
7880                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7881                 ShmemVariableCache->nextXid = checkPoint.nextXid;
7882                 LWLockRelease(XidGenLock);
7883                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7884                 ShmemVariableCache->nextOid = checkPoint.nextOid;
7885                 ShmemVariableCache->oidCount = 0;
7886                 LWLockRelease(OidGenLock);
7887                 MultiXactSetNextMXact(checkPoint.nextMulti,
7888                                                           checkPoint.nextMultiOffset);
7889                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
7890
7891                 /*
7892                  * If we see a shutdown checkpoint while waiting for an end-of-backup
7893                  * record, the backup was canceled and the end-of-backup record will
7894                  * never arrive.
7895                  */
7896                 if (InArchiveRecovery &&
7897                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
7898                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
7899                         ereport(PANIC,
7900                         (errmsg("online backup was canceled, recovery cannot continue")));
7901
7902                 /*
7903                  * If we see a shutdown checkpoint, we know that nothing was running
7904                  * on the master at this point. So fake-up an empty running-xacts
7905                  * record and use that here and now. Recover additional standby state
7906                  * for prepared transactions.
7907                  */
7908                 if (standbyState >= STANDBY_INITIALIZED)
7909                 {
7910                         TransactionId *xids;
7911                         int                     nxids;
7912                         TransactionId oldestActiveXID;
7913                         TransactionId latestCompletedXid;
7914                         RunningTransactionsData running;
7915
7916                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7917
7918                         /*
7919                          * Construct a RunningTransactions snapshot representing a shut
7920                          * down server, with only prepared transactions still alive. We're
7921                          * never overflowed at this point because all subxids are listed
7922                          * with their parent prepared transactions.
7923                          */
7924                         running.xcnt = nxids;
7925                         running.subxcnt = 0;
7926                         running.subxid_overflow = false;
7927                         running.nextXid = checkPoint.nextXid;
7928                         running.oldestRunningXid = oldestActiveXID;
7929                         latestCompletedXid = checkPoint.nextXid;
7930                         TransactionIdRetreat(latestCompletedXid);
7931                         Assert(TransactionIdIsNormal(latestCompletedXid));
7932                         running.latestCompletedXid = latestCompletedXid;
7933                         running.xids = xids;
7934
7935                         ProcArrayApplyRecoveryInfo(&running);
7936
7937                         StandbyRecoverPreparedTransactions(true);
7938                 }
7939
7940                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7941                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7942                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7943
7944                 /* Update shared-memory copy of checkpoint XID/epoch */
7945                 {
7946                         /* use volatile pointer to prevent code rearrangement */
7947                         volatile XLogCtlData *xlogctl = XLogCtl;
7948
7949                         SpinLockAcquire(&xlogctl->info_lck);
7950                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7951                         xlogctl->ckptXid = checkPoint.nextXid;
7952                         SpinLockRelease(&xlogctl->info_lck);
7953                 }
7954
7955                 /*
7956                  * TLI may change in a shutdown checkpoint.
7957                  */
7958                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7959                 {
7960                         /*
7961                          * The new timeline better be in the list of timelines we expect
7962                          * to see, according to the timeline history. It should also not
7963                          * decrease.
7964                          */
7965                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
7966                                 !tliInHistory(checkPoint.ThisTimeLineID, expectedTLEs))
7967                                 ereport(PANIC,
7968                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7969                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
7970
7971                         /*
7972                          * If we have not yet reached min recovery point, and we're about
7973                          * to switch to a timeline greater than the timeline of the min
7974                          * recovery point: trouble. After switching to the new timeline,
7975                          * we could not possibly visit the min recovery point on the
7976                          * correct timeline anymore. This can happen if there is a newer
7977                          * timeline in the archive that branched before the timeline the
7978                          * min recovery point is on, and you attempt to do PITR to the
7979                          * new timeline.
7980                          */
7981                         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
7982                                 XLByteLT(lsn, minRecoveryPoint) &&
7983                                 checkPoint.ThisTimeLineID > minRecoveryPointTLI)
7984                                 ereport(PANIC,
7985                                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
7986                                                                 checkPoint.ThisTimeLineID,
7987                                                                 (uint32) (minRecoveryPoint >> 32),
7988                                                                 (uint32) minRecoveryPoint,
7989                                                                 minRecoveryPointTLI)));
7990
7991                         /* Following WAL records should be run with new TLI */
7992                         ThisTimeLineID = checkPoint.ThisTimeLineID;
7993                 }
7994
7995                 RecoveryRestartPoint(&checkPoint);
7996         }
7997         else if (info == XLOG_CHECKPOINT_ONLINE)
7998         {
7999                 CheckPoint      checkPoint;
8000
8001                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8002                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8003                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8004                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
8005                                                                   checkPoint.nextXid))
8006                         ShmemVariableCache->nextXid = checkPoint.nextXid;
8007                 LWLockRelease(XidGenLock);
8008                 /* ... but still treat OID counter as exact */
8009                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8010                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8011                 ShmemVariableCache->oidCount = 0;
8012                 LWLockRelease(OidGenLock);
8013                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8014                                                                   checkPoint.nextMultiOffset);
8015                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
8016                                                                   checkPoint.oldestXid))
8017                         SetTransactionIdLimit(checkPoint.oldestXid,
8018                                                                   checkPoint.oldestXidDB);
8019
8020                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8021                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8022                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8023
8024                 /* Update shared-memory copy of checkpoint XID/epoch */
8025                 {
8026                         /* use volatile pointer to prevent code rearrangement */
8027                         volatile XLogCtlData *xlogctl = XLogCtl;
8028
8029                         SpinLockAcquire(&xlogctl->info_lck);
8030                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8031                         xlogctl->ckptXid = checkPoint.nextXid;
8032                         SpinLockRelease(&xlogctl->info_lck);
8033                 }
8034
8035                 /* TLI should not change in an on-line checkpoint */
8036                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8037                         ereport(PANIC,
8038                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8039                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8040
8041                 RecoveryRestartPoint(&checkPoint);
8042         }
8043         else if (info == XLOG_NOOP)
8044         {
8045                 /* nothing to do here */
8046         }
8047         else if (info == XLOG_SWITCH)
8048         {
8049                 /* nothing to do here */
8050         }
8051         else if (info == XLOG_RESTORE_POINT)
8052         {
8053                 /* nothing to do here */
8054         }
8055         else if (info == XLOG_BACKUP_END)
8056         {
8057                 XLogRecPtr      startpoint;
8058
8059                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8060
8061                 if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
8062                 {
8063                         /*
8064                          * We have reached the end of base backup, the point where
8065                          * pg_stop_backup() was done. The data on disk is now consistent.
8066                          * Reset backupStartPoint, and update minRecoveryPoint to make
8067                          * sure we don't allow starting up at an earlier point even if
8068                          * recovery is stopped and restarted soon after this.
8069                          */
8070                         elog(DEBUG1, "end of backup reached");
8071
8072                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8073
8074                         if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
8075                         {
8076                                 ControlFile->minRecoveryPoint = lsn;
8077                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8078                         }
8079                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
8080                         ControlFile->backupEndRequired = false;
8081                         UpdateControlFile();
8082
8083                         LWLockRelease(ControlFileLock);
8084                 }
8085         }
8086         else if (info == XLOG_PARAMETER_CHANGE)
8087         {
8088                 xl_parameter_change xlrec;
8089
8090                 /* Update our copy of the parameters in pg_control */
8091                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8092
8093                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8094                 ControlFile->MaxConnections = xlrec.MaxConnections;
8095                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8096                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8097                 ControlFile->wal_level = xlrec.wal_level;
8098
8099                 /*
8100                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8101                  * recover back up to this point before allowing hot standby again.
8102                  * This is particularly important if wal_level was set to 'archive'
8103                  * before, and is now 'hot_standby', to ensure you don't run queries
8104                  * against the WAL preceding the wal_level change. Same applies to
8105                  * decreasing max_* settings.
8106                  */
8107                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8108                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8109                 if (minRecoveryPoint != 0 && XLByteLT(minRecoveryPoint, lsn))
8110                 {
8111                         ControlFile->minRecoveryPoint = lsn;
8112                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8113                 }
8114
8115                 UpdateControlFile();
8116                 LWLockRelease(ControlFileLock);
8117
8118                 /* Check to see if any changes to max_connections give problems */
8119                 CheckRequiredParameterValues();
8120         }
8121         else if (info == XLOG_FPW_CHANGE)
8122         {
8123                 /* use volatile pointer to prevent code rearrangement */
8124                 volatile XLogCtlData *xlogctl = XLogCtl;
8125                 bool            fpw;
8126
8127                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8128
8129                 /*
8130                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8131                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
8132                  * full_page_writes has been disabled during online backup.
8133                  */
8134                 if (!fpw)
8135                 {
8136                         SpinLockAcquire(&xlogctl->info_lck);
8137                         if (XLByteLT(xlogctl->lastFpwDisableRecPtr, ReadRecPtr))
8138                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
8139                         SpinLockRelease(&xlogctl->info_lck);
8140                 }
8141
8142                 /* Keep track of full_page_writes */
8143                 lastFullPageWrites = fpw;
8144         }
8145 }
8146
8147 #ifdef WAL_DEBUG
8148
8149 static void
8150 xlog_outrec(StringInfo buf, XLogRecord *record)
8151 {
8152         int                     i;
8153
8154         appendStringInfo(buf, "prev %X/%X; xid %u",
8155                                          (uint32) (record->xl_prev >> 32),
8156                                          (uint32) record->xl_prev,
8157                                          record->xl_xid);
8158
8159         appendStringInfo(buf, "; len %u",
8160                                          record->xl_len);
8161
8162         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
8163         {
8164                 if (record->xl_info & XLR_BKP_BLOCK(i))
8165                         appendStringInfo(buf, "; bkpb%d", i);
8166         }
8167
8168         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
8169 }
8170 #endif   /* WAL_DEBUG */
8171
8172
8173 /*
8174  * Return the (possible) sync flag used for opening a file, depending on the
8175  * value of the GUC wal_sync_method.
8176  */
8177 static int
8178 get_sync_bit(int method)
8179 {
8180         int                     o_direct_flag = 0;
8181
8182         /* If fsync is disabled, never open in sync mode */
8183         if (!enableFsync)
8184                 return 0;
8185
8186         /*
8187          * Optimize writes by bypassing kernel cache with O_DIRECT when using
8188          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
8189          * disabled, otherwise the archive command or walsender process will read
8190          * the WAL soon after writing it, which is guaranteed to cause a physical
8191          * read if we bypassed the kernel cache. We also skip the
8192          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8193          * reason.
8194          *
8195          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8196          * written by walreceiver is normally read by the startup process soon
8197          * after its written. Also, walreceiver performs unaligned writes, which
8198          * don't work with O_DIRECT, so it is required for correctness too.
8199          */
8200         if (!XLogIsNeeded() && !AmWalReceiverProcess())
8201                 o_direct_flag = PG_O_DIRECT;
8202
8203         switch (method)
8204         {
8205                         /*
8206                          * enum values for all sync options are defined even if they are
8207                          * not supported on the current platform.  But if not, they are
8208                          * not included in the enum option array, and therefore will never
8209                          * be seen here.
8210                          */
8211                 case SYNC_METHOD_FSYNC:
8212                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8213                 case SYNC_METHOD_FDATASYNC:
8214                         return 0;
8215 #ifdef OPEN_SYNC_FLAG
8216                 case SYNC_METHOD_OPEN:
8217                         return OPEN_SYNC_FLAG | o_direct_flag;
8218 #endif
8219 #ifdef OPEN_DATASYNC_FLAG
8220                 case SYNC_METHOD_OPEN_DSYNC:
8221                         return OPEN_DATASYNC_FLAG | o_direct_flag;
8222 #endif
8223                 default:
8224                         /* can't happen (unless we are out of sync with option array) */
8225                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
8226                         return 0;                       /* silence warning */
8227         }
8228 }
8229
8230 /*
8231  * GUC support
8232  */
8233 void
8234 assign_xlog_sync_method(int new_sync_method, void *extra)
8235 {
8236         if (sync_method != new_sync_method)
8237         {
8238                 /*
8239                  * To ensure that no blocks escape unsynced, force an fsync on the
8240                  * currently open log segment (if any).  Also, if the open flag is
8241                  * changing, close the log file so it will be reopened (with new flag
8242                  * bit) at next use.
8243                  */
8244                 if (openLogFile >= 0)
8245                 {
8246                         if (pg_fsync(openLogFile) != 0)
8247                                 ereport(PANIC,
8248                                                 (errcode_for_file_access(),
8249                                                  errmsg("could not fsync log segment %s: %m",
8250                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
8251                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
8252                                 XLogFileClose();
8253                 }
8254         }
8255 }
8256
8257
8258 /*
8259  * Issue appropriate kind of fsync (if any) for an XLOG output file.
8260  *
8261  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8262  * 'log' and 'seg' are for error reporting purposes.
8263  */
8264 void
8265 issue_xlog_fsync(int fd, XLogSegNo segno)
8266 {
8267         switch (sync_method)
8268         {
8269                 case SYNC_METHOD_FSYNC:
8270                         if (pg_fsync_no_writethrough(fd) != 0)
8271                                 ereport(PANIC,
8272                                                 (errcode_for_file_access(),
8273                                                  errmsg("could not fsync log file %s: %m",
8274                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8275                         break;
8276 #ifdef HAVE_FSYNC_WRITETHROUGH
8277                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8278                         if (pg_fsync_writethrough(fd) != 0)
8279                                 ereport(PANIC,
8280                                                 (errcode_for_file_access(),
8281                                                  errmsg("could not fsync write-through log file %s: %m",
8282                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8283                         break;
8284 #endif
8285 #ifdef HAVE_FDATASYNC
8286                 case SYNC_METHOD_FDATASYNC:
8287                         if (pg_fdatasync(fd) != 0)
8288                                 ereport(PANIC,
8289                                                 (errcode_for_file_access(),
8290                                                  errmsg("could not fdatasync log file %s: %m",
8291                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8292                         break;
8293 #endif
8294                 case SYNC_METHOD_OPEN:
8295                 case SYNC_METHOD_OPEN_DSYNC:
8296                         /* write synced it already */
8297                         break;
8298                 default:
8299                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8300                         break;
8301         }
8302 }
8303
8304 /*
8305  * Return the filename of given log segment, as a palloc'd string.
8306  */
8307 char *
8308 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
8309 {
8310         char       *result = palloc(MAXFNAMELEN);
8311         XLogFileName(result, tli, segno);
8312         return result;
8313 }
8314
8315 /*
8316  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
8317  * function. It creates the necessary starting checkpoint and constructs the
8318  * backup label file.
8319  *
8320  * There are two kind of backups: exclusive and non-exclusive. An exclusive
8321  * backup is started with pg_start_backup(), and there can be only one active
8322  * at a time. The backup label file of an exclusive backup is written to
8323  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
8324  *
8325  * A non-exclusive backup is used for the streaming base backups (see
8326  * src/backend/replication/basebackup.c). The difference to exclusive backups
8327  * is that the backup label file is not written to disk. Instead, its would-be
8328  * contents are returned in *labelfile, and the caller is responsible for
8329  * including it in the backup archive as 'backup_label'. There can be many
8330  * non-exclusive backups active at the same time, and they don't conflict
8331  * with an exclusive backup either.
8332  *
8333  * Every successfully started non-exclusive backup must be stopped by calling
8334  * do_pg_stop_backup() or do_pg_abort_backup().
8335  */
8336 XLogRecPtr
8337 do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
8338 {
8339         bool            exclusive = (labelfile == NULL);
8340         bool            backup_started_in_recovery = false;
8341         XLogRecPtr      checkpointloc;
8342         XLogRecPtr      startpoint;
8343         pg_time_t       stamp_time;
8344         char            strfbuf[128];
8345         char            xlogfilename[MAXFNAMELEN];
8346         XLogSegNo       _logSegNo;
8347         struct stat stat_buf;
8348         FILE       *fp;
8349         StringInfoData labelfbuf;
8350
8351         backup_started_in_recovery = RecoveryInProgress();
8352
8353         if (!superuser() && !is_authenticated_user_replication_role())
8354                 ereport(ERROR,
8355                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8356                    errmsg("must be superuser or replication role to run a backup")));
8357
8358         /*
8359          * Currently only non-exclusive backup can be taken during recovery.
8360          */
8361         if (backup_started_in_recovery && exclusive)
8362                 ereport(ERROR,
8363                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8364                                  errmsg("recovery is in progress"),
8365                                  errhint("WAL control functions cannot be executed during recovery.")));
8366
8367         /*
8368          * During recovery, we don't need to check WAL level. Because, if WAL
8369          * level is not sufficient, it's impossible to get here during recovery.
8370          */
8371         if (!backup_started_in_recovery && !XLogIsNeeded())
8372                 ereport(ERROR,
8373                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8374                           errmsg("WAL level not sufficient for making an online backup"),
8375                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8376
8377         if (strlen(backupidstr) > MAXPGPATH)
8378                 ereport(ERROR,
8379                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8380                                  errmsg("backup label too long (max %d bytes)",
8381                                                 MAXPGPATH)));
8382
8383         /*
8384          * Mark backup active in shared memory.  We must do full-page WAL writes
8385          * during an on-line backup even if not doing so at other times, because
8386          * it's quite possible for the backup dump to obtain a "torn" (partially
8387          * written) copy of a database page if it reads the page concurrently with
8388          * our write to the same page.  This can be fixed as long as the first
8389          * write to the page in the WAL sequence is a full-page write. Hence, we
8390          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
8391          * are no dirty pages in shared memory that might get dumped while the
8392          * backup is in progress without having a corresponding WAL record.  (Once
8393          * the backup is complete, we need not force full-page writes anymore,
8394          * since we expect that any pages not modified during the backup interval
8395          * must have been correctly captured by the backup.)
8396          *
8397          * Note that forcePageWrites has no effect during an online backup from
8398          * the standby.
8399          *
8400          * We must hold WALInsertLock to change the value of forcePageWrites, to
8401          * ensure adequate interlocking against XLogInsert().
8402          */
8403         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8404         if (exclusive)
8405         {
8406                 if (XLogCtl->Insert.exclusiveBackup)
8407                 {
8408                         LWLockRelease(WALInsertLock);
8409                         ereport(ERROR,
8410                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8411                                          errmsg("a backup is already in progress"),
8412                                          errhint("Run pg_stop_backup() and try again.")));
8413                 }
8414                 XLogCtl->Insert.exclusiveBackup = true;
8415         }
8416         else
8417                 XLogCtl->Insert.nonExclusiveBackups++;
8418         XLogCtl->Insert.forcePageWrites = true;
8419         LWLockRelease(WALInsertLock);
8420
8421         /* Ensure we release forcePageWrites if fail below */
8422         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8423         {
8424                 bool            gotUniqueStartpoint = false;
8425
8426                 /*
8427                  * Force an XLOG file switch before the checkpoint, to ensure that the
8428                  * WAL segment the checkpoint is written to doesn't contain pages with
8429                  * old timeline IDs.  That would otherwise happen if you called
8430                  * pg_start_backup() right after restoring from a PITR archive: the
8431                  * first WAL segment containing the startup checkpoint has pages in
8432                  * the beginning with the old timeline ID.      That can cause trouble at
8433                  * recovery: we won't have a history file covering the old timeline if
8434                  * pg_xlog directory was not included in the base backup and the WAL
8435                  * archive was cleared too before starting the backup.
8436                  *
8437                  * This also ensures that we have emitted a WAL page header that has
8438                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
8439                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
8440                  * compress out removable backup blocks, it won't remove any that
8441                  * occur after this point.
8442                  *
8443                  * During recovery, we skip forcing XLOG file switch, which means that
8444                  * the backup taken during recovery is not available for the special
8445                  * recovery case described above.
8446                  */
8447                 if (!backup_started_in_recovery)
8448                         RequestXLogSwitch();
8449
8450                 do
8451                 {
8452                         bool            checkpointfpw;
8453
8454                         /*
8455                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
8456                          * page problems, this guarantees that two successive backup runs
8457                          * will have different checkpoint positions and hence different
8458                          * history file names, even if nothing happened in between.
8459                          *
8460                          * During recovery, establish a restartpoint if possible. We use
8461                          * the last restartpoint as the backup starting checkpoint. This
8462                          * means that two successive backup runs can have same checkpoint
8463                          * positions.
8464                          *
8465                          * Since the fact that we are executing do_pg_start_backup()
8466                          * during recovery means that checkpointer is running, we can use
8467                          * RequestCheckpoint() to establish a restartpoint.
8468                          *
8469                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
8470                          * passing fast = true).  Otherwise this can take awhile.
8471                          */
8472                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8473                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
8474
8475                         /*
8476                          * Now we need to fetch the checkpoint record location, and also
8477                          * its REDO pointer.  The oldest point in WAL that would be needed
8478                          * to restore starting from the checkpoint is precisely the REDO
8479                          * pointer.
8480                          */
8481                         LWLockAcquire(ControlFileLock, LW_SHARED);
8482                         checkpointloc = ControlFile->checkPoint;
8483                         startpoint = ControlFile->checkPointCopy.redo;
8484                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
8485                         LWLockRelease(ControlFileLock);
8486
8487                         if (backup_started_in_recovery)
8488                         {
8489                                 /* use volatile pointer to prevent code rearrangement */
8490                                 volatile XLogCtlData *xlogctl = XLogCtl;
8491                                 XLogRecPtr      recptr;
8492
8493                                 /*
8494                                  * Check to see if all WAL replayed during online backup
8495                                  * (i.e., since last restartpoint used as backup starting
8496                                  * checkpoint) contain full-page writes.
8497                                  */
8498                                 SpinLockAcquire(&xlogctl->info_lck);
8499                                 recptr = xlogctl->lastFpwDisableRecPtr;
8500                                 SpinLockRelease(&xlogctl->info_lck);
8501
8502                                 if (!checkpointfpw || XLByteLE(startpoint, recptr))
8503                                         ereport(ERROR,
8504                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8505                                                    errmsg("WAL generated with full_page_writes=off was replayed "
8506                                                                   "since last restartpoint"),
8507                                                    errhint("This means that the backup being taken on standby "
8508                                                                    "is corrupt and should not be used. "
8509                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
8510                                                                    "and then try an online backup again.")));
8511
8512                                 /*
8513                                  * During recovery, since we don't use the end-of-backup WAL
8514                                  * record and don't write the backup history file, the
8515                                  * starting WAL location doesn't need to be unique. This means
8516                                  * that two base backups started at the same time might use
8517                                  * the same checkpoint as starting locations.
8518                                  */
8519                                 gotUniqueStartpoint = true;
8520                         }
8521
8522                         /*
8523                          * If two base backups are started at the same time (in WAL sender
8524                          * processes), we need to make sure that they use different
8525                          * checkpoints as starting locations, because we use the starting
8526                          * WAL location as a unique identifier for the base backup in the
8527                          * end-of-backup WAL record and when we write the backup history
8528                          * file. Perhaps it would be better generate a separate unique ID
8529                          * for each backup instead of forcing another checkpoint, but
8530                          * taking a checkpoint right after another is not that expensive
8531                          * either because only few buffers have been dirtied yet.
8532                          */
8533                         LWLockAcquire(WALInsertLock, LW_SHARED);
8534                         if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
8535                         {
8536                                 XLogCtl->Insert.lastBackupStart = startpoint;
8537                                 gotUniqueStartpoint = true;
8538                         }
8539                         LWLockRelease(WALInsertLock);
8540                 } while (!gotUniqueStartpoint);
8541
8542                 XLByteToSeg(startpoint, _logSegNo);
8543                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
8544
8545                 /*
8546                  * Construct backup label file
8547                  */
8548                 initStringInfo(&labelfbuf);
8549
8550                 /* Use the log timezone here, not the session timezone */
8551                 stamp_time = (pg_time_t) time(NULL);
8552                 pg_strftime(strfbuf, sizeof(strfbuf),
8553                                         "%Y-%m-%d %H:%M:%S %Z",
8554                                         pg_localtime(&stamp_time, log_timezone));
8555                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
8556                                                  (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
8557                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
8558                                                  (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
8559                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
8560                                                  exclusive ? "pg_start_backup" : "streamed");
8561                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
8562                                                  backup_started_in_recovery ? "standby" : "master");
8563                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
8564                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
8565
8566                 /*
8567                  * Okay, write the file, or return its contents to caller.
8568                  */
8569                 if (exclusive)
8570                 {
8571                         /*
8572                          * Check for existing backup label --- implies a backup is already
8573                          * running.  (XXX given that we checked exclusiveBackup above,
8574                          * maybe it would be OK to just unlink any such label file?)
8575                          */
8576                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
8577                         {
8578                                 if (errno != ENOENT)
8579                                         ereport(ERROR,
8580                                                         (errcode_for_file_access(),
8581                                                          errmsg("could not stat file \"%s\": %m",
8582                                                                         BACKUP_LABEL_FILE)));
8583                         }
8584                         else
8585                                 ereport(ERROR,
8586                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8587                                                  errmsg("a backup is already in progress"),
8588                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
8589                                                                  BACKUP_LABEL_FILE)));
8590
8591                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
8592
8593                         if (!fp)
8594                                 ereport(ERROR,
8595                                                 (errcode_for_file_access(),
8596                                                  errmsg("could not create file \"%s\": %m",
8597                                                                 BACKUP_LABEL_FILE)));
8598                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
8599                                 fflush(fp) != 0 ||
8600                                 pg_fsync(fileno(fp)) != 0 ||
8601                                 ferror(fp) ||
8602                                 FreeFile(fp))
8603                                 ereport(ERROR,
8604                                                 (errcode_for_file_access(),
8605                                                  errmsg("could not write file \"%s\": %m",
8606                                                                 BACKUP_LABEL_FILE)));
8607                         pfree(labelfbuf.data);
8608                 }
8609                 else
8610                         *labelfile = labelfbuf.data;
8611         }
8612         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8613
8614         /*
8615          * We're done.  As a convenience, return the starting WAL location.
8616          */
8617         return startpoint;
8618 }
8619
8620 /* Error cleanup callback for pg_start_backup */
8621 static void
8622 pg_start_backup_callback(int code, Datum arg)
8623 {
8624         bool            exclusive = DatumGetBool(arg);
8625
8626         /* Update backup counters and forcePageWrites on failure */
8627         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8628         if (exclusive)
8629         {
8630                 Assert(XLogCtl->Insert.exclusiveBackup);
8631                 XLogCtl->Insert.exclusiveBackup = false;
8632         }
8633         else
8634         {
8635                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
8636                 XLogCtl->Insert.nonExclusiveBackups--;
8637         }
8638
8639         if (!XLogCtl->Insert.exclusiveBackup &&
8640                 XLogCtl->Insert.nonExclusiveBackups == 0)
8641         {
8642                 XLogCtl->Insert.forcePageWrites = false;
8643         }
8644         LWLockRelease(WALInsertLock);
8645 }
8646
/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 */
8654 XLogRecPtr
8655 do_pg_stop_backup(char *labelfile, bool waitforarchive)
8656 {
8657         bool            exclusive = (labelfile == NULL);
8658         bool            backup_started_in_recovery = false;
8659         XLogRecPtr      startpoint;
8660         XLogRecPtr      stoppoint;
8661         XLogRecData rdata;
8662         pg_time_t       stamp_time;
8663         char            strfbuf[128];
8664         char            histfilepath[MAXPGPATH];
8665         char            startxlogfilename[MAXFNAMELEN];
8666         char            stopxlogfilename[MAXFNAMELEN];
8667         char            lastxlogfilename[MAXFNAMELEN];
8668         char            histfilename[MAXFNAMELEN];
8669         char            backupfrom[20];
8670         XLogSegNo       _logSegNo;
8671         FILE       *lfp;
8672         FILE       *fp;
8673         char            ch;
8674         int                     seconds_before_warning;
8675         int                     waits = 0;
8676         bool            reported_waiting = false;
8677         char       *remaining;
8678         char       *ptr;
8679         uint32          hi,
8680                                 lo;
8681
8682         backup_started_in_recovery = RecoveryInProgress();
8683
8684         if (!superuser() && !is_authenticated_user_replication_role())
8685                 ereport(ERROR,
8686                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8687                  (errmsg("must be superuser or replication role to run a backup"))));
8688
8689         /*
8690          * Currently only non-exclusive backup can be taken during recovery.
8691          */
8692         if (backup_started_in_recovery && exclusive)
8693                 ereport(ERROR,
8694                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8695                                  errmsg("recovery is in progress"),
8696                                  errhint("WAL control functions cannot be executed during recovery.")));
8697
8698         /*
8699          * During recovery, we don't need to check WAL level. Because, if WAL
8700          * level is not sufficient, it's impossible to get here during recovery.
8701          */
8702         if (!backup_started_in_recovery && !XLogIsNeeded())
8703                 ereport(ERROR,
8704                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8705                           errmsg("WAL level not sufficient for making an online backup"),
8706                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8707
8708         /*
8709          * OK to update backup counters and forcePageWrites
8710          */
8711         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8712         if (exclusive)
8713                 XLogCtl->Insert.exclusiveBackup = false;
8714         else
8715         {
8716                 /*
8717                  * The user-visible pg_start/stop_backup() functions that operate on
8718                  * exclusive backups can be called at any time, but for non-exclusive
8719                  * backups, it is expected that each do_pg_start_backup() call is
8720                  * matched by exactly one do_pg_stop_backup() call.
8721                  */
8722                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
8723                 XLogCtl->Insert.nonExclusiveBackups--;
8724         }
8725
8726         if (!XLogCtl->Insert.exclusiveBackup &&
8727                 XLogCtl->Insert.nonExclusiveBackups == 0)
8728         {
8729                 XLogCtl->Insert.forcePageWrites = false;
8730         }
8731         LWLockRelease(WALInsertLock);
8732
8733         if (exclusive)
8734         {
8735                 /*
8736                  * Read the existing label file into memory.
8737                  */
8738                 struct stat statbuf;
8739                 int                     r;
8740
8741                 if (stat(BACKUP_LABEL_FILE, &statbuf))
8742                 {
8743                         if (errno != ENOENT)
8744                                 ereport(ERROR,
8745                                                 (errcode_for_file_access(),
8746                                                  errmsg("could not stat file \"%s\": %m",
8747                                                                 BACKUP_LABEL_FILE)));
8748                         ereport(ERROR,
8749                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8750                                          errmsg("a backup is not in progress")));
8751                 }
8752
8753                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
8754                 if (!lfp)
8755                 {
8756                         ereport(ERROR,
8757                                         (errcode_for_file_access(),
8758                                          errmsg("could not read file \"%s\": %m",
8759                                                         BACKUP_LABEL_FILE)));
8760                 }
8761                 labelfile = palloc(statbuf.st_size + 1);
8762                 r = fread(labelfile, statbuf.st_size, 1, lfp);
8763                 labelfile[statbuf.st_size] = '\0';
8764
8765                 /*
8766                  * Close and remove the backup label file
8767                  */
8768                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
8769                         ereport(ERROR,
8770                                         (errcode_for_file_access(),
8771                                          errmsg("could not read file \"%s\": %m",
8772                                                         BACKUP_LABEL_FILE)));
8773                 if (unlink(BACKUP_LABEL_FILE) != 0)
8774                         ereport(ERROR,
8775                                         (errcode_for_file_access(),
8776                                          errmsg("could not remove file \"%s\": %m",
8777                                                         BACKUP_LABEL_FILE)));
8778         }
8779
8780         /*
8781          * Read and parse the START WAL LOCATION line (this code is pretty crude,
8782          * but we are not expecting any variability in the file format).
8783          */
8784         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
8785                            &hi, &lo, startxlogfilename,
8786                            &ch) != 4 || ch != '\n')
8787                 ereport(ERROR,
8788                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8789                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8790         startpoint = ((uint64) hi) << 32 | lo;
8791         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
8792
8793         /*
8794          * Parse the BACKUP FROM line. If we are taking an online backup from the
8795          * standby, we confirm that the standby has not been promoted during the
8796          * backup.
8797          */
8798         ptr = strstr(remaining, "BACKUP FROM:");
8799         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
8800                 ereport(ERROR,
8801                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8802                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8803         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
8804                 ereport(ERROR,
8805                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8806                                  errmsg("the standby was promoted during online backup"),
8807                                  errhint("This means that the backup being taken is corrupt "
8808                                                  "and should not be used. "
8809                                                  "Try taking another online backup.")));
8810
8811         /*
8812          * During recovery, we don't write an end-of-backup record. We assume that
8813          * pg_control was backed up last and its minimum recovery point can be
8814          * available as the backup end location. Since we don't have an
8815          * end-of-backup record, we use the pg_control value to check whether
8816          * we've reached the end of backup when starting recovery from this
8817          * backup. We have no way of checking if pg_control wasn't backed up last
8818          * however.
8819          *
8820          * We don't force a switch to new WAL file and wait for all the required
8821          * files to be archived. This is okay if we use the backup to start the
8822          * standby. But, if it's for an archive recovery, to ensure all the
8823          * required files are available, a user should wait for them to be
8824          * archived, or include them into the backup.
8825          *
8826          * We return the current minimum recovery point as the backup end
8827          * location. Note that it can be greater than the exact backup end
8828          * location if the minimum recovery point is updated after the backup of
8829          * pg_control. This is harmless for current uses.
8830          *
8831          * XXX currently a backup history file is for informational and debug
8832          * purposes only. It's not essential for an online backup. Furthermore,
8833          * even if it's created, it will not be archived during recovery because
8834          * an archiver is not invoked. So it doesn't seem worthwhile to write a
8835          * backup history file during recovery.
8836          */
8837         if (backup_started_in_recovery)
8838         {
8839                 /* use volatile pointer to prevent code rearrangement */
8840                 volatile XLogCtlData *xlogctl = XLogCtl;
8841                 XLogRecPtr      recptr;
8842
8843                 /*
8844                  * Check to see if all WAL replayed during online backup contain
8845                  * full-page writes.
8846                  */
8847                 SpinLockAcquire(&xlogctl->info_lck);
8848                 recptr = xlogctl->lastFpwDisableRecPtr;
8849                 SpinLockRelease(&xlogctl->info_lck);
8850
8851                 if (XLByteLE(startpoint, recptr))
8852                         ereport(ERROR,
8853                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8854                            errmsg("WAL generated with full_page_writes=off was replayed "
8855                                           "during online backup"),
8856                                  errhint("This means that the backup being taken on standby "
8857                                                  "is corrupt and should not be used. "
8858                                  "Enable full_page_writes and run CHECKPOINT on the master, "
8859                                                  "and then try an online backup again.")));
8860
8861
8862                 LWLockAcquire(ControlFileLock, LW_SHARED);
8863                 stoppoint = ControlFile->minRecoveryPoint;
8864                 LWLockRelease(ControlFileLock);
8865
8866                 return stoppoint;
8867         }
8868
8869         /*
8870          * Write the backup-end xlog record
8871          */
8872         rdata.data = (char *) (&startpoint);
8873         rdata.len = sizeof(startpoint);
8874         rdata.buffer = InvalidBuffer;
8875         rdata.next = NULL;
8876         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
8877
8878         /*
8879          * Force a switch to a new xlog segment file, so that the backup is valid
8880          * as soon as archiver moves out the current segment file.
8881          */
8882         RequestXLogSwitch();
8883
8884         XLByteToPrevSeg(stoppoint, _logSegNo);
8885         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
8886
8887         /* Use the log timezone here, not the session timezone */
8888         stamp_time = (pg_time_t) time(NULL);
8889         pg_strftime(strfbuf, sizeof(strfbuf),
8890                                 "%Y-%m-%d %H:%M:%S %Z",
8891                                 pg_localtime(&stamp_time, log_timezone));
8892
8893         /*
8894          * Write the backup history file
8895          */
8896         XLByteToSeg(startpoint, _logSegNo);
8897         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
8898                                                   (uint32) (startpoint % XLogSegSize));
8899         fp = AllocateFile(histfilepath, "w");
8900         if (!fp)
8901                 ereport(ERROR,
8902                                 (errcode_for_file_access(),
8903                                  errmsg("could not create file \"%s\": %m",
8904                                                 histfilepath)));
8905         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
8906                         (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
8907         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
8908                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
8909         /* transfer remaining lines from label to history file */
8910         fprintf(fp, "%s", remaining);
8911         fprintf(fp, "STOP TIME: %s\n", strfbuf);
8912         if (fflush(fp) || ferror(fp) || FreeFile(fp))
8913                 ereport(ERROR,
8914                                 (errcode_for_file_access(),
8915                                  errmsg("could not write file \"%s\": %m",
8916                                                 histfilepath)));
8917
8918         /*
8919          * Clean out any no-longer-needed history files.  As a side effect, this
8920          * will post a .ready file for the newly created history file, notifying
8921          * the archiver that history file may be archived immediately.
8922          */
8923         CleanupBackupHistory();
8924
8925         /*
8926          * If archiving is enabled, wait for all the required WAL files to be
8927          * archived before returning. If archiving isn't enabled, the required WAL
8928          * needs to be transported via streaming replication (hopefully with
8929          * wal_keep_segments set high enough), or some more exotic mechanism like
8930          * polling and copying files from pg_xlog with script. We have no
8931          * knowledge of those mechanisms, so it's up to the user to ensure that he
8932          * gets all the required WAL.
8933          *
8934          * We wait until both the last WAL file filled during backup and the
8935          * history file have been archived, and assume that the alphabetic sorting
8936          * property of the WAL files ensures any earlier WAL files are safely
8937          * archived as well.
8938          *
8939          * We wait forever, since archive_command is supposed to work and we
8940          * assume the admin wanted his backup to work completely. If you don't
8941          * wish to wait, you can set statement_timeout.  Also, some notices are
8942          * issued to clue in anyone who might be doing this interactively.
8943          */
8944         if (waitforarchive && XLogArchivingActive())
8945         {
8946                 XLByteToPrevSeg(stoppoint, _logSegNo);
8947                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
8948
8949                 XLByteToSeg(startpoint, _logSegNo);
8950                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
8951                                                           (uint32) (startpoint % XLogSegSize));
8952
8953                 seconds_before_warning = 60;
8954                 waits = 0;
8955
8956                 while (XLogArchiveIsBusy(lastxlogfilename) ||
8957                            XLogArchiveIsBusy(histfilename))
8958                 {
8959                         CHECK_FOR_INTERRUPTS();
8960
8961                         if (!reported_waiting && waits > 5)
8962                         {
8963                                 ereport(NOTICE,
8964                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
8965                                 reported_waiting = true;
8966                         }
8967
8968                         pg_usleep(1000000L);
8969
8970                         if (++waits >= seconds_before_warning)
8971                         {
8972                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
8973                                 ereport(WARNING,
8974                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
8975                                                                 waits),
8976                                                  errhint("Check that your archive_command is executing properly.  "
8977                                                                  "pg_stop_backup can be canceled safely, "
8978                                                                  "but the database backup will not be usable without all the WAL segments.")));
8979                         }
8980                 }
8981
8982                 ereport(NOTICE,
8983                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
8984         }
8985         else if (waitforarchive)
8986                 ereport(NOTICE,
8987                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
8988
8989         /*
8990          * We're done.  As a convenience, return the ending WAL location.
8991          */
8992         return stoppoint;
8993 }
8994
8995
8996 /*
8997  * do_pg_abort_backup: abort a running backup
8998  *
8999  * This does just the most basic steps of do_pg_stop_backup(), by taking the
9000  * system out of backup mode, thus making it a lot more safe to call from
9001  * an error handler.
9002  *
9003  * NB: This is only for aborting a non-exclusive backup that doesn't write
9004  * backup_label. A backup started with pg_stop_backup() needs to be finished
9005  * with pg_stop_backup().
9006  */
9007 void
9008 do_pg_abort_backup(void)
9009 {
9010         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9011         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9012         XLogCtl->Insert.nonExclusiveBackups--;
9013
9014         if (!XLogCtl->Insert.exclusiveBackup &&
9015                 XLogCtl->Insert.nonExclusiveBackups == 0)
9016         {
9017                 XLogCtl->Insert.forcePageWrites = false;
9018         }
9019         LWLockRelease(WALInsertLock);
9020 }
9021
9022 /*
9023  * Get latest redo apply position.
9024  *
9025  * Optionally, returns the current recovery target timeline. Callers not
9026  * interested in that may pass NULL for targetTLI.
9027  *
9028  * Exported to allow WALReceiver to read the pointer directly.
9029  */
9030 XLogRecPtr
9031 GetXLogReplayRecPtr(TimeLineID *targetTLI)
9032 {
9033         /* use volatile pointer to prevent code rearrangement */
9034         volatile XLogCtlData *xlogctl = XLogCtl;
9035         XLogRecPtr      recptr;
9036
9037         SpinLockAcquire(&xlogctl->info_lck);
9038         recptr = xlogctl->recoveryLastRecPtr;
9039         if (targetTLI)
9040                 *targetTLI = xlogctl->RecoveryTargetTLI;
9041         SpinLockRelease(&xlogctl->info_lck);
9042
9043         return recptr;
9044 }
9045
9046 /*
9047  * Get current standby flush position, ie, the last WAL position
9048  * known to be fsync'd to disk in standby.
9049  *
9050  * If 'targetTLI' is not NULL, it's set to the current recovery target
9051  * timeline.
9052  */
9053 XLogRecPtr
9054 GetStandbyFlushRecPtr(TimeLineID *targetTLI)
9055 {
9056         XLogRecPtr      receivePtr;
9057         XLogRecPtr      replayPtr;
9058
9059         receivePtr = GetWalRcvWriteRecPtr(NULL);
9060         replayPtr = GetXLogReplayRecPtr(targetTLI);
9061
9062         if (XLByteLT(receivePtr, replayPtr))
9063                 return replayPtr;
9064         else
9065                 return receivePtr;
9066 }
9067
9068 /*
9069  * Get latest WAL insert pointer
9070  */
9071 XLogRecPtr
9072 GetXLogInsertRecPtr(void)
9073 {
9074         XLogCtlInsert *Insert = &XLogCtl->Insert;
9075         XLogRecPtr      current_recptr;
9076
9077         LWLockAcquire(WALInsertLock, LW_SHARED);
9078         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
9079         LWLockRelease(WALInsertLock);
9080
9081         return current_recptr;
9082 }
9083
9084 /*
9085  * Get latest WAL write pointer
9086  */
9087 XLogRecPtr
9088 GetXLogWriteRecPtr(void)
9089 {
9090         {
9091                 /* use volatile pointer to prevent code rearrangement */
9092                 volatile XLogCtlData *xlogctl = XLogCtl;
9093
9094                 SpinLockAcquire(&xlogctl->info_lck);
9095                 LogwrtResult = xlogctl->LogwrtResult;
9096                 SpinLockRelease(&xlogctl->info_lck);
9097         }
9098
9099         return LogwrtResult.Write;
9100 }
9101
9102 /*
9103  * Returns the redo pointer of the last checkpoint or restartpoint. This is
9104  * the oldest point in WAL that we still need, if we have to restart recovery.
9105  */
9106 void
9107 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9108 {
9109         LWLockAcquire(ControlFileLock, LW_SHARED);
9110         *oldrecptr = ControlFile->checkPointCopy.redo;
9111         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9112         LWLockRelease(ControlFileLock);
9113 }
9114
9115 /*
9116  * read_backup_label: check to see if a backup_label file is present
9117  *
9118  * If we see a backup_label during recovery, we assume that we are recovering
9119  * from a backup dump file, and we therefore roll forward from the checkpoint
9120  * identified by the label file, NOT what pg_control says.      This avoids the
9121  * problem that pg_control might have been archived one or more checkpoints
9122  * later than the start of the dump, and so if we rely on it as the start
9123  * point, we will fail to restore a consistent database state.
9124  *
9125  * Returns TRUE if a backup_label was found (and fills the checkpoint
9126  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9127  * respectively); returns FALSE if not. If this backup_label came from a
9128  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
9129  * was created during recovery, *backupFromStandby is set to TRUE.
9130  */
9131 static bool
9132 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
9133                                   bool *backupFromStandby)
9134 {
9135         char            startxlogfilename[MAXFNAMELEN];
9136         TimeLineID      tli;
9137         FILE       *lfp;
9138         char            ch;
9139         char            backuptype[20];
9140         char            backupfrom[20];
9141         uint32          hi,
9142                                 lo;
9143
9144         *backupEndRequired = false;
9145         *backupFromStandby = false;
9146
9147         /*
9148          * See if label file is present
9149          */
9150         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9151         if (!lfp)
9152         {
9153                 if (errno != ENOENT)
9154                         ereport(FATAL,
9155                                         (errcode_for_file_access(),
9156                                          errmsg("could not read file \"%s\": %m",
9157                                                         BACKUP_LABEL_FILE)));
9158                 return false;                   /* it's not there, all is fine */
9159         }
9160
9161         /*
9162          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
9163          * is pretty crude, but we are not expecting any variability in the file
9164          * format).
9165          */
9166         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9167                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
9168                 ereport(FATAL,
9169                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9170                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9171         RedoStartLSN = ((uint64) hi) << 32 | lo;
9172         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
9173                            &hi, &lo, &ch) != 3 || ch != '\n')
9174                 ereport(FATAL,
9175                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9176                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9177         *checkPointLoc = ((uint64) hi) << 32 | lo;
9178
9179         /*
9180          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
9181          * from an older backup anyway, but since the information on it is not
9182          * strictly required, don't error out if it's missing for some reason.
9183          */
9184         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
9185         {
9186                 if (strcmp(backuptype, "streamed") == 0)
9187                         *backupEndRequired = true;
9188         }
9189
9190         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
9191         {
9192                 if (strcmp(backupfrom, "standby") == 0)
9193                         *backupFromStandby = true;
9194         }
9195
9196         if (ferror(lfp) || FreeFile(lfp))
9197                 ereport(FATAL,
9198                                 (errcode_for_file_access(),
9199                                  errmsg("could not read file \"%s\": %m",
9200                                                 BACKUP_LABEL_FILE)));
9201
9202         return true;
9203 }
9204
9205 /*
9206  * Error context callback for errors occurring during rm_redo().
9207  */
9208 static void
9209 rm_redo_error_callback(void *arg)
9210 {
9211         XLogRecord *record = (XLogRecord *) arg;
9212         StringInfoData buf;
9213
9214         initStringInfo(&buf);
9215         RmgrTable[record->xl_rmid].rm_desc(&buf,
9216                                                                            record->xl_info,
9217                                                                            XLogRecGetData(record));
9218
9219         /* don't bother emitting empty description */
9220         if (buf.len > 0)
9221                 errcontext("xlog redo %s", buf.data);
9222
9223         pfree(buf.data);
9224 }
9225
9226 /*
9227  * BackupInProgress: check if online backup mode is active
9228  *
9229  * This is done by checking for existence of the "backup_label" file.
9230  */
9231 bool
9232 BackupInProgress(void)
9233 {
9234         struct stat stat_buf;
9235
9236         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
9237 }
9238
9239 /*
9240  * CancelBackup: rename the "backup_label" file to cancel backup mode
9241  *
9242  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
9243  * Note that this will render an online backup in progress useless.
9244  * To correctly finish an online backup, pg_stop_backup must be called.
9245  */
9246 void
9247 CancelBackup(void)
9248 {
9249         struct stat stat_buf;
9250
9251         /* if the file is not there, return */
9252         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
9253                 return;
9254
9255         /* remove leftover file from previously canceled backup if it exists */
9256         unlink(BACKUP_LABEL_OLD);
9257
9258         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
9259         {
9260                 ereport(LOG,
9261                                 (errmsg("online backup mode canceled"),
9262                                  errdetail("\"%s\" was renamed to \"%s\".",
9263                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9264         }
9265         else
9266         {
9267                 ereport(WARNING,
9268                                 (errcode_for_file_access(),
9269                                  errmsg("online backup mode was not canceled"),
9270                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
9271                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9272         }
9273 }
9274
9275 /*
9276  * Read the XLOG page containing RecPtr into readBuf (if not read already).
9277  * Returns true if the page is read successfully.
9278  *
9279  * This is responsible for restoring files from archive as needed, as well
9280  * as for waiting for the requested WAL record to arrive in standby mode.
9281  *
9282  * 'emode' specifies the log level used for reporting "file not found" or
9283  * "end of WAL" situations in archive recovery, or in standby mode when a
9284  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
9285  * false in those situations, on higher log levels the ereport() won't
9286  * return.
9287  *
9288  * In standby mode, if after a successful return of XLogPageRead() the
9289  * caller finds the record it's interested in to be broken, it should
9290  * ereport the error with the level determined by
9291  * emode_for_corrupt_record(), and then set lastSourceFailed
9292  * and call XLogPageRead() again with the same arguments. This lets
9293  * XLogPageRead() try fetching the record from another source, or
9294  * sleep and retry.
9295  */
9296 static bool
9297 XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
9298                          bool randAccess)
9299 {
9300         uint32          targetPageOff;
9301         uint32          targetRecOff;
9302         XLogSegNo       targetSegNo;
9303
9304         XLByteToSeg(*RecPtr, targetSegNo);
9305         targetPageOff = (((*RecPtr) % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
9306         targetRecOff = (*RecPtr) % XLOG_BLCKSZ;
9307
9308         /* Fast exit if we have read the record in the current buffer already */
9309         if (!lastSourceFailed && targetSegNo == readSegNo &&
9310                 targetPageOff == readOff && targetRecOff < readLen)
9311                 return true;
9312
9313         /*
9314          * See if we need to switch to a new segment because the requested record
9315          * is not in the currently open one.
9316          */
9317         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readSegNo))
9318         {
9319                 /*
9320                  * Request a restartpoint if we've replayed too much xlog since the
9321                  * last one.
9322                  */
9323                 if (StandbyMode && bgwriterLaunched)
9324                 {
9325                         if (XLogCheckpointNeeded(readSegNo))
9326                         {
9327                                 (void) GetRedoRecPtr();
9328                                 if (XLogCheckpointNeeded(readSegNo))
9329                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
9330                         }
9331                 }
9332
9333                 close(readFile);
9334                 readFile = -1;
9335                 readSource = 0;
9336         }
9337
9338         XLByteToSeg(*RecPtr, readSegNo);
9339
9340 retry:
9341         /* See if we need to retrieve more data */
9342         if (readFile < 0 ||
9343                 (readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
9344         {
9345                 if (StandbyMode)
9346                 {
9347                         if (!WaitForWALToBecomeAvailable(*RecPtr, randAccess,
9348                                                                                          fetching_ckpt))
9349                                 goto triggered;
9350                 }
9351                 else
9352                 {
9353                         /* In archive or crash recovery. */
9354                         if (readFile < 0)
9355                         {
9356                                 int                     source;
9357
9358                                 /* Reset curFileTLI if random fetch. */
9359                                 if (randAccess)
9360                                         curFileTLI = 0;
9361
9362                                 if (InArchiveRecovery)
9363                                         source = XLOG_FROM_ANY;
9364                                 else
9365                                         source = XLOG_FROM_PG_XLOG;
9366
9367                                 readFile = XLogFileReadAnyTLI(readSegNo, emode, source);
9368                                 if (readFile < 0)
9369                                         return false;
9370                         }
9371                 }
9372         }
9373
9374         /*
9375          * At this point, we have the right segment open and if we're streaming we
9376          * know the requested record is in it.
9377          */
9378         Assert(readFile != -1);
9379
9380         /*
9381          * If the current segment is being streamed from master, calculate how
9382          * much of the current page we have received already. We know the
9383          * requested record has been received, but this is for the benefit of
9384          * future calls, to allow quick exit at the top of this function.
9385          */
9386         if (readSource == XLOG_FROM_STREAM)
9387         {
9388                 if (((*RecPtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
9389                 {
9390                         readLen = XLOG_BLCKSZ;
9391                 }
9392                 else
9393                         readLen = receivedUpto % XLogSegSize - targetPageOff;
9394         }
9395         else
9396                 readLen = XLOG_BLCKSZ;
9397
9398         if (!readFileHeaderValidated && targetPageOff != 0)
9399         {
9400                 /*
9401                  * Whenever switching to a new WAL segment, we read the first page of
9402                  * the file and validate its header, even if that's not where the
9403                  * target record is.  This is so that we can check the additional
9404                  * identification info that is present in the first page's "long"
9405                  * header.
9406                  */
9407                 readOff = 0;
9408                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
9409                 {
9410                         char fname[MAXFNAMELEN];
9411                         XLogFileName(fname, curFileTLI, readSegNo);
9412                         ereport(emode_for_corrupt_record(emode, *RecPtr),
9413                                         (errcode_for_file_access(),
9414                                          errmsg("could not read from log segment %s, offset %u: %m",
9415                                                         fname, readOff)));
9416                         goto next_record_is_invalid;
9417                 }
9418                 if (!ValidXLogPageHeader((XLogPageHeader) readBuf, emode, true))
9419                         goto next_record_is_invalid;
9420         }
9421
9422         /* Read the requested page */
9423         readOff = targetPageOff;
9424         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
9425         {
9426                 char fname[MAXFNAMELEN];
9427                 XLogFileName(fname, curFileTLI, readSegNo);
9428                 ereport(emode_for_corrupt_record(emode, *RecPtr),
9429                                 (errcode_for_file_access(),
9430                  errmsg("could not seek in log segment %s to offset %u: %m",
9431                                 fname, readOff)));
9432                 goto next_record_is_invalid;
9433         }
9434         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
9435         {
9436                 char fname[MAXFNAMELEN];
9437                 XLogFileName(fname, curFileTLI, readSegNo);
9438                 ereport(emode_for_corrupt_record(emode, *RecPtr),
9439                                 (errcode_for_file_access(),
9440                  errmsg("could not read from log segment %s, offset %u: %m",
9441                                 fname, readOff)));
9442                 goto next_record_is_invalid;
9443         }
9444         if (!ValidXLogPageHeader((XLogPageHeader) readBuf, emode, false))
9445                 goto next_record_is_invalid;
9446
9447         readFileHeaderValidated = true;
9448
9449         Assert(targetSegNo == readSegNo);
9450         Assert(targetPageOff == readOff);
9451         Assert(targetRecOff < readLen);
9452
9453         return true;
9454
9455 next_record_is_invalid:
9456         lastSourceFailed = true;
9457
9458         if (readFile >= 0)
9459                 close(readFile);
9460         readFile = -1;
9461         readLen = 0;
9462         readSource = 0;
9463
9464         /* In standby-mode, keep trying */
9465         if (StandbyMode)
9466                 goto retry;
9467         else
9468                 return false;
9469
9470 triggered:
9471         if (readFile >= 0)
9472                 close(readFile);
9473         readFile = -1;
9474         readLen = 0;
9475         readSource = 0;
9476
9477         return false;
9478 }
9479
9480 /*
9481  * In standby mode, wait for the requested record to become available, either
9482  * via restore_command succeeding to restore the segment, or via walreceiver
9483  * having streamed the record (or via someone copying the segment directly to
9484  * pg_xlog, but that is not documented or recommended).
9485  *
9486  * When the requested record becomes available, the function opens the file
9487  * containing it (if not open already), and returns true. When end of standby
9488  * mode is triggered by the user, and there is no more WAL available, returns
9489  * false.
      *
      * Arguments:
      *   RecPtr        - WAL position being waited for.  While streaming it is
      *                   compared against receivedUpto; it is also the position
      *                   streaming is requested from, unless fetching_ckpt.
      *   randAccess    - if true, curFileTLI is reset to 0 before each attempt
      *                   to read from archive or pg_xlog.
      *   fetching_ckpt - if true, streaming is requested from RedoStartLSN
      *                   instead of RecPtr.
      *
      * The segment that gets opened is identified by the file-level readSegNo,
      * not derived from RecPtr here; XLogPageRead() sets readSegNo from the
      * record pointer before calling this function.
      *
      * Besides readFile, this function updates the file-level state
      * currentSource, lastSourceFailed and receivedUpto, and on the streaming
      * path readSource, XLogReceiptSource and XLogReceiptTime.
9490  */
9491 static bool
9492 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
9493                                                         bool fetching_ckpt)
9494 {
             /*
              * Time at which we last ran out of sources (end of the
              * XLOG_FROM_STREAM failure handling below).  Static, so it persists
              * across calls and can rate-limit retries to one pass per 5 seconds.
              */
9495         static pg_time_t last_fail_time = 0;
9496         pg_time_t now;
9497
9498         /*-------
9499          * Standby mode is implemented by a state machine:
9500          *
9501          * 1. Read from archive (XLOG_FROM_ARCHIVE)
9502          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
9503          * 3. Check trigger file
9504          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
9505          * 5. Rescan timelines
9506          * 6. Sleep 5 seconds, and loop back to 1.
9507          *
9508          * Failure to read from the current source advances the state machine to
9509          * the next state. In addition, successfully reading a file from pg_xlog
9510          * moves the state machine from state 2 back to state 1 (we always prefer
9511          * files in the archive over files in pg_xlog).
9512          *
9513          * 'currentSource' indicates the current state. There are no currentSource
9514          * values for "check trigger", "rescan timelines", and "sleep" states,
9515          * those actions are taken when reading from the previous source fails, as
9516          * part of advancing to the next state.
9517          *-------
9518          */
9519         if (currentSource == 0)
9520                 currentSource = XLOG_FROM_ARCHIVE;
9521
9522         for (;;)
9523         {
                     /* Remembered only so the DEBUG2 message below can report a switch. */
9524                 int             oldSource = currentSource;
9525
9526                 /*
9527                  * First check if we failed to read from the current source, and
9528                  * advance the state machine if so. The failure to read might've
9529                  * happened outside this function, e.g when a CRC check fails on a
9530                  * record, or within this loop.
9531                  */
9532                 if (lastSourceFailed)
9533                 {
9534
9535                         switch (currentSource)
9536                         {
9537                                 case XLOG_FROM_ARCHIVE:
9538                                         currentSource = XLOG_FROM_PG_XLOG;
9539                                         break;
9540
9541                                 case XLOG_FROM_PG_XLOG:
9542                                         /*
9543                                          * Check to see if the trigger file exists. Note that we do
9544                                          * this only after failure, so when you create the trigger
9545                                          * file, we still finish replaying as much as we can from
9546                                          * archive and pg_xlog before failover.
9547                                          */
9548                                         if (CheckForStandbyTrigger())
9549                                                 return false;
9550
9551                                         /*
9552                                          * If primary_conninfo is set, launch walreceiver to try to
9553                                          * stream the missing WAL.
9554                                          *
9555                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
9556                                          * checkpoint location. In that case, we use RedoStartLSN
9557                                          * as the streaming start position instead of RecPtr, so
9558                                          * that when we later jump backwards to start redo at
9559                                          * RedoStartLSN, we will have the logs streamed already.
9560                                          */
9561                                         if (PrimaryConnInfo)
9562                                         {
9563                                                 XLogRecPtr ptr = fetching_ckpt ? RedoStartLSN : RecPtr;
9564
9565                                                 RequestXLogStreaming(ptr, PrimaryConnInfo);
9566                                         }
9567                                         /*
9568                                          * Move to XLOG_FROM_STREAM state in either case. We'll get
9569                                          * immediate failure if we didn't launch walreceiver, and
9570                                          * move on to the next state.
9571                                          */
9572                                         currentSource = XLOG_FROM_STREAM;
9573                                         break;
9574
9575                                 case XLOG_FROM_STREAM:
9576                                         /*
9577                                          * Failure while streaming. Most likely, we got here because
9578                                          * streaming replication was terminated, or promotion was
9579                                          * triggered. But we also get here if we find an invalid
9580                                          * record in the WAL streamed from master, in which case
9581                                          * something is seriously wrong. There's little chance that
9582                                          * the problem will just go away, but PANIC is not good for
9583                                          * availability either, especially in hot standby mode. So,
9584                                          * we treat that the same as disconnection, and retry from
9585                                          * archive/pg_xlog again. The WAL in the archive should be
9586                                          * identical to what was streamed, so it's unlikely that it
9587                                          * helps, but one can hope...
9588                                          */
9589                                         /*
9590                                          * Before we leave XLOG_FROM_STREAM state, make sure that
9591                                          * walreceiver is not running, so that it won't overwrite
9592                                          * any WAL that we restore from archive.
9593                                          */
9594                                         if (WalRcvInProgress())
9595                                                 ShutdownWalRcv();
9596
9597                                         /*
9598                                          * Before we sleep, re-scan for possible new timelines if
9599                                          * we were requested to recover to the latest timeline.
                                              * If the rescan reports success, go straight back to
                                              * the archive without sleeping.
9600                                          */
9601                                         if (recoveryTargetIsLatest)
9602                                         {
9603                                                 if (rescanLatestTimeLine())
9604                                                 {
9605                                                         currentSource = XLOG_FROM_ARCHIVE;
9606                                                         break;
9607                                                 }
9608                                         }
9609
9610                                         /*
9611                                          * XLOG_FROM_STREAM is the last state in our state machine,
9612                                          * so we've exhausted all the options for obtaining the
9613                                          * requested WAL. We're going to loop back and retry from
9614                                          * the archive, but if it hasn't been long since last
9615                                          * attempt, sleep 5 seconds to avoid busy-waiting.
                                              *
                                              * Only the remainder of the 5-second interval since
                                              * last_fail_time is slept; the clock is then re-read so
                                              * that last_fail_time records the time after the sleep.
9616                                          */
9617                                         now = (pg_time_t) time(NULL);
9618                                         if ((now - last_fail_time) < 5)
9619                                         {
9620                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
9621                                                 now = (pg_time_t) time(NULL);
9622                                         }
9623                                         last_fail_time = now;
9624                                         currentSource = XLOG_FROM_ARCHIVE;
9625                                         break;
9626
9627                                 default:
9628                                         elog(ERROR, "unexpected WAL source %d", currentSource);
9629                         }
9630                 }
9631                 else if (currentSource == XLOG_FROM_PG_XLOG)
9632                 {
9633                         /*
9634                          * We just successfully read a file in pg_xlog. We prefer files
9635                          * in the archive over ones in pg_xlog, so try the next file
9636                          * again from the archive first.
9637                          */
9638                         currentSource = XLOG_FROM_ARCHIVE;
9639                 }
9640
                     /* lastSourceFailed is still the pre-reset value here, so it tells why we switched. */
9641                 if (currentSource != oldSource)
9642                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
9643                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
9644                                  lastSourceFailed ? "failure" : "success");
9645
9646                 /*
9647                  * We've now handled possible failure. Try to read from the chosen
9648                  * source.
9649                  */
9650                 lastSourceFailed = false;
9651
9652                 switch (currentSource)
9653                 {
9654                         case XLOG_FROM_ARCHIVE:
9655                         case XLOG_FROM_PG_XLOG:
9656                                 /* Close any old file we might have open. */
9657                                 if (readFile >= 0)
9658                                 {
9659                                         close(readFile);
9660                                         readFile = -1;
9661                                 }
9662                                 /* Reset curFileTLI if random fetch. */
9663                                 if (randAccess)
9664                                         curFileTLI = 0;
9665
9666                                 /*
9667                                  * Try to restore the file from archive, or read an existing
9668                                  * file from pg_xlog.
9669                                  */
9670                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
9671                                 if (readFile >= 0)
9672                                         return true;    /* success! */
9673
9674                                 /*
9675                                  * Nope, not found in archive or pg_xlog.
9676                                  */
9677                                 lastSourceFailed = true;
9678                                 break;
9679
9680                         case XLOG_FROM_STREAM:
9681                         {
9682                                 bool            havedata;
9683
9684                                 /*
9685                                  * Check if WAL receiver is still active.
9686                                  */
9687                                 if (!WalRcvInProgress())
9688                                 {
9689                                         lastSourceFailed = true;
9690                                         break;
9691                                 }
9692
9693                                 /*
9694                                  * Walreceiver is active, so see if new data has arrived.
9695                                  *
9696                                  * We only advance XLogReceiptTime when we obtain fresh WAL
9697                                  * from walreceiver and observe that we had already processed
9698                                  * everything before the most recent "chunk" that it flushed to
9699                                  * disk.  In steady state where we are keeping up with the
9700                                  * incoming data, XLogReceiptTime will be updated on each cycle.
9701                                  * When we are behind, XLogReceiptTime will not advance, so the
9702                                  * grace time allotted to conflicting queries will decrease.
                                      *
                                      * The cached receivedUpto is tested first; it is refreshed
                                      * from walreceiver only when the cached value does not
                                      * already cover RecPtr.
9703                                  */
9704                                 if (XLByteLT(RecPtr, receivedUpto))
9705                                         havedata = true;
9706                                 else
9707                                 {
9708                                         XLogRecPtr      latestChunkStart;
9709
9710                                         receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
9711                                         if (XLByteLT(RecPtr, receivedUpto))
9712                                         {
9713                                                 havedata = true;
9714                                                 if (!XLByteLT(RecPtr, latestChunkStart))
9715                                                 {
9716                                                         XLogReceiptTime = GetCurrentTimestamp();
9717                                                         SetCurrentChunkStartTime(XLogReceiptTime);
9718                                                 }
9719                                         }
9720                                         else
9721                                                 havedata = false;
9722                                 }
9723                                 if (havedata)
9724                                 {
9725                                         /*
9726                                          * Great, streamed far enough.  Open the file if it's not
9727                                          * open already.  Use XLOG_FROM_STREAM so that source info
9728                                          * is set correctly and XLogReceiptTime isn't changed.
9729                                          */
9730                                         if (readFile < 0)
9731                                         {
9732                                                 readFile = XLogFileRead(readSegNo, PANIC,
9733                                                                                                 recoveryTargetTLI,
9734                                                                                                 XLOG_FROM_STREAM, false);
9735                                                 Assert(readFile >= 0);
9736                                         }
9737                                         else
9738                                         {
9739                                                 /* just make sure source info is correct... */
9740                                                 readSource = XLOG_FROM_STREAM;
9741                                                 XLogReceiptSource = XLOG_FROM_STREAM;
9742                                                 return true;
9743                                         }
                                             /*
                                              * Reached only when the file was just opened above.  We
                                              * do not return from here; we leave the switch and go
                                              * around the loop once more, where (provided walreceiver
                                              * is still running) the else-branch above sets the source
                                              * info and returns true.
                                              */
9744                                         break;
9745                                 }
9746
9747                                 /*
9748                                  * Data not here yet. Check for trigger, then wait for
9749                                  * walreceiver to wake us up when new WAL arrives.
9750                                  */
9751                                 if (CheckForStandbyTrigger())
9752                                 {
9753                                         /*
9754                                          * Note that we don't "return false" immediately here.
9755                                          * After being triggered, we still want to replay all the
9756                                          * WAL that was already streamed. It's in pg_xlog now, so
9757                                          * we just treat this as a failure, and the state machine
9758                                          * will move on to replay the streamed WAL from pg_xlog,
9759                                          * and then recheck the trigger and exit replay.
9760                                          */
9761                                         lastSourceFailed = true;
9762                                         break;
9763                                 }
9764
9765                                 /*
9766                                  * Wait for more WAL to arrive. Time out after 5 seconds, like
9767                                  * when polling the archive, to react to a trigger file
9768                                  * promptly.
                                      *
                                      * recoveryWakeupLatch is the latch that WakeupRecovery()
                                      * sets.  Whichever way the wait ends, we reset the latch
                                      * and loop to re-evaluate the state from scratch.
9769                                  */
9770                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
9771                                                   WL_LATCH_SET | WL_TIMEOUT,
9772                                                   5000L);
9773                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
9774                                 break;
9775                         }
9776
9777                         default:
9778                                 elog(ERROR, "unexpected WAL source %d", currentSource);
9779                 }
9780
9781                 /*
9782                  * This possibly-long loop needs to handle interrupts of startup
9783                  * process.
9784                  */
9785                 HandleStartupProcInterrupts();
9786         }
9787
9788         return false;   /* not reached */
9789 }
9790
9791 /*
9792  * Determine what log level should be used to report a corrupt WAL record
9793  * in the current WAL page, previously read by XLogPageRead().
9794  *
9795  * 'emode' is the error mode that would be used to report a file-not-found
9796  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
9797  * we're retrying the exact same record that we've tried previously, only
9798  * complain the first time to keep the noise down.      However, we only do when
9799  * reading from pg_xlog, because we don't expect any invalid records in archive
9800  * or in records streamed from master. Files in the archive should be complete,
9801  * and we should never hit the end of WAL because we stop and wait for more WAL
9802  * to arrive before replaying it.
9803  *
9804  * NOTE: This function remembers the RecPtr value it was last called with,
9805  * to suppress repeated messages about the same record. Only call this when
9806  * you are about to ereport(), or you might cause a later message to be
9807  * erroneously suppressed.
9808  */
9809 static int
9810 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
9811 {
9812         static XLogRecPtr lastComplaint = 0;
9813
9814         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
9815         {
9816                 if (XLByteEQ(RecPtr, lastComplaint))
9817                         emode = DEBUG1;
9818                 else
9819                         lastComplaint = RecPtr;
9820         }
9821         return emode;
9822 }
9823
9824 /*
9825  * Check to see whether the user-specified trigger file exists and whether a
9826  * promote request has arrived.  If either condition holds, request postmaster
9827  * to shut down walreceiver, wait for it to exit, and return true.
9828  */
9829 static bool
9830 CheckForStandbyTrigger(void)
9831 {
9832         struct stat stat_buf;
9833         static bool triggered = false;
9834
9835         if (triggered)
9836                 return true;
9837
9838         if (IsPromoteTriggered())
9839         {
9840                 ereport(LOG,
9841                                 (errmsg("received promote request")));
9842                 ShutdownWalRcv();
9843                 ResetPromoteTriggered();
9844                 triggered = true;
9845                 return true;
9846         }
9847
9848         if (TriggerFile == NULL)
9849                 return false;
9850
9851         if (stat(TriggerFile, &stat_buf) == 0)
9852         {
9853                 ereport(LOG,
9854                                 (errmsg("trigger file found: %s", TriggerFile)));
9855                 ShutdownWalRcv();
9856                 unlink(TriggerFile);
9857                 triggered = true;
9858                 return true;
9859         }
9860         return false;
9861 }
9862
9863 /*
9864  * Check to see if a promote request has arrived. Should be
9865  * called by postmaster after receiving SIGUSR1.
9866  */
9867 bool
9868 CheckPromoteSignal(void)
9869 {
9870         struct stat stat_buf;
9871
9872         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9873         {
9874                 /*
9875                  * Since we are in a signal handler, it's not safe to elog. We
9876                  * silently ignore any error from unlink.
9877                  */
9878                 unlink(PROMOTE_SIGNAL_FILE);
9879                 return true;
9880         }
9881         return false;
9882 }
9883
9884 /*
9885  * Wake up startup process to replay newly arrived WAL, or to notice that
9886  * failover has been requested.
9887  */
9888 void
9889 WakeupRecovery(void)
9890 {
9891         SetLatch(&XLogCtl->recoveryWakeupLatch);
9892 }
9893
9894 /*
9895  * Update the WalWriterSleeping flag.
9896  */
9897 void
9898 SetWalWriterSleeping(bool sleeping)
9899 {
9900         /* use volatile pointer to prevent code rearrangement */
9901         volatile XLogCtlData *xlogctl = XLogCtl;
9902
9903         SpinLockAcquire(&xlogctl->info_lck);
9904         xlogctl->WalWriterSleeping = sleeping;
9905         SpinLockRelease(&xlogctl->info_lck);
9906 }