granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.350 2009/08/31 02:23:22 tgl Exp $
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <sys/wait.h>
  24 #include <unistd.h>
  25
  26 #include "access/clog.h"
  27 #include "access/multixact.h"
  28 #include "access/subtrans.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "catalog/pg_type.h"
  39 #include "funcapi.h"
  40 #include "libpq/pqsignal.h"
  41 #include "miscadmin.h"
  42 #include "pgstat.h"
  43 #include "postmaster/bgwriter.h"
  44 #include "storage/bufmgr.h"
  45 #include "storage/fd.h"
  46 #include "storage/ipc.h"
  47 #include "storage/pmsignal.h"
  48 #include "storage/procarray.h"
  49 #include "storage/smgr.h"
  50 #include "storage/spin.h"
  51 #include "utils/builtins.h"
  52 #include "utils/flatfiles.h"
  53 #include "utils/guc.h"
  54 #include "utils/ps_status.h"
  55 #include "pg_trace.h"
  56
  57
   58 /* File path names (all relative to $PGDATA, so no directory part appears here) */
   59 #define BACKUP_LABEL_FILE               "backup_label"
   60 #define BACKUP_LABEL_OLD                "backup_label.old"      /* NOTE(review): presumably the name backup_label is renamed to once processed -- confirm at use site */
   61 #define RECOVERY_COMMAND_FILE   "recovery.conf"         /* the recovery options declared further down are taken from this file */
   62 #define RECOVERY_COMMAND_DONE   "recovery.done"         /* NOTE(review): presumably the name recovery.conf gets when recovery ends -- confirm at use site */
  63
  64
   65 /* User-settable parameters (none are static, so all are visible outside this file) */
   66 int                     CheckPointSegments = 3;        /* also determines XLOGfileslop, which is 2*CheckPointSegments + 1 */
   67 int                     XLOGbuffers = 8;
   68 int                     XLogArchiveTimeout = 0;
   69 bool            XLogArchiveMode = false;
   70 char       *XLogArchiveCommand = NULL;
   71 bool            fullPageWrites = true;  /* XLogInsert ORs this with Insert->forcePageWrites to decide on full-page writes */
   72 bool            log_checkpoints = false;
   73 int                     sync_method = DEFAULT_SYNC_METHOD;      /* a SYNC_METHOD_* value; the names for them are listed in sync_method_options[] */
   74
   75 #ifdef WAL_DEBUG
   76 bool            XLOG_DEBUG = false;     /* exists only in builds that define WAL_DEBUG */
   77 #endif
  78
   79 /*
   80  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
   81  * When we are done with an old XLOG segment file, we will recycle it as a
   82  * future XLOG segment as long as there aren't already XLOGfileslop future
   83  * segments; else we'll delete it.  This could be made a separate GUC
   84  * variable, but at present I think it's sufficient to hardwire it as
   85  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
   86  * no more than 2*CheckPointSegments log segments, and we want to recycle all
   87  * of them; the +1 allows boundary cases to happen without wasting a
   88  * delete/create-segment cycle.  Being a macro, it re-reads CheckPointSegments at every use.
   89  */
   90 #define XLOGfileslop    (2*CheckPointSegments + 1)
  91
   92 /*
   93  * GUC support: table pairing each sync method name with its SYNC_METHOD_* value.  "fsync" is always present; every other entry is compiled in only when the platform macro guarding it is defined.  The {NULL, 0, false} entry terminates the table.
   94  */
   95 const struct config_enum_entry sync_method_options[] = {
   96         {"fsync", SYNC_METHOD_FSYNC, false},
   97 #ifdef HAVE_FSYNC_WRITETHROUGH
   98         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
   99 #endif
  100 #ifdef HAVE_FDATASYNC
  101         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  102 #endif
  103 #ifdef OPEN_SYNC_FLAG
  104         {"open_sync", SYNC_METHOD_OPEN, false},
  105 #endif
  106 #ifdef OPEN_DATASYNC_FLAG
  107         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  108 #endif
  109         {NULL, 0, false}
  110 };
 111
  112 /*
  113  * Statistics for the current checkpoint are collected in this global struct.
  114  * Because only the background writer or a stand-alone backend can perform
  115  * checkpoints, this will be unused in normal backends.
  116  */
  117 CheckpointStatsData CheckpointStats;    /* not static: visible outside this file */
 118
  119 /*
  120  * ThisTimeLineID will be the same in all backends --- it identifies the current
  121  * WAL timeline for the database system.  (XLogCtlData has a field of the same name; this is the per-process variable.)
  122  */
  123 TimeLineID      ThisTimeLineID = 0;     /* 0 until something assigns it */
 124
  125 /*
  126  * Are we doing recovery from XLOG?
  127  *
  128  * This is only ever true in the startup process; it should be read as meaning
  129  * "this process is replaying WAL records", rather than "the system is in
  130  * recovery mode".  It should be examined primarily by functions that need
  131  * to act differently when called from a WAL redo function (e.g., to skip WAL
  132  * logging).  To check whether the system is in recovery regardless of which
  133  * process you're running in, use RecoveryInProgress().
  134  */
  135 bool            InRecovery = false;     /* not static: visible outside this file */
  136
  137 /*
  138  * Local copy of the SharedRecoveryInProgress field of XLogCtlData.  True actually
  139  * means "not known, need to check the shared state"; it starts out true, i.e. in that "not known" state.
  140  */
  141 static bool LocalRecoveryInProgress = true;
  142
  143 /*
  144  * Local state for XLogInsertAllowed():
  145  *              1: unconditionally allowed to insert XLOG
  146  *              0: unconditionally not allowed to insert XLOG
  147  *              -1: must check RecoveryInProgress(); disallow until it is false
  148  * Most processes start with -1 and transition to 1 after seeing that recovery
  149  * is not in progress.  But we can also force the value for special cases.
  150  * The coding in XLogInsertAllowed() depends on the first two of these states
  151  * being numerically the same as bool true and false.  (XLogInsert raises an ERROR when XLogInsertAllowed() returns false.)
  152  */
  153 static int      LocalXLogInsertAllowed = -1;    /* begin in the "must check" state */
 154
  155 /* Are we recovering using offline XLOG archives? */
  156 static bool InArchiveRecovery = false;
  157
  158 /* Was the last xlog file restored from archive (true), or was it a local file (false)? */
  159 static bool restoredFromArchive = false;
  160
  161 /* options taken from recovery.conf (the values below are what is in effect before that file is read) */
  162 static char *recoveryRestoreCommand = NULL;
  163 static char *recoveryEndCommand = NULL;
  164 static bool recoveryTarget = false;     /* NOTE(review): presumably "was a recovery target given at all?" -- confirm in readRecoveryCommandFile */
  165 static bool recoveryTargetExact = false;
  166 static bool recoveryTargetInclusive = true;
  167 static TransactionId recoveryTargetXid; /* no initializer: zero-initialized like any C static */
  168 static TimestampTz recoveryTargetTime;  /* no initializer: zero-initialized like any C static */
  169 static TimestampTz recoveryLastXTime = 0;
  170
  171 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
  172 static TransactionId recoveryStopXid;
  173 static TimestampTz recoveryStopTime;
  174 static bool recoveryStopAfter;  /* NOTE(review): presumably "stopped after, not before, the record" -- confirm in recoveryStopsHere */
 175
  176 /*
  177  * During normal operation, the only timeline we care about is ThisTimeLineID.
  178  * During recovery, however, things are more complicated.  To simplify life
  179  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
  180  * scan through the WAL history (that is, it is the line that was active when
  181  * the currently-scanned WAL record was generated).  We also need these
  182  * timeline values:
  183  *
  184  * recoveryTargetTLI: the desired timeline that we want to end in.
  185  *
  186  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
  187  * its known parents, newest first (so recoveryTargetTLI is always the
  188  * first list member).  Only these TLIs are expected to be seen in the WAL
  189  * segments we read, and indeed only these TLIs will be considered as
  190  * candidate WAL files to open at all.
  191  *
  192  * curFileTLI: the TLI appearing in the name of the current input WAL file.
  193  * (This is not necessarily the same as ThisTimeLineID, because we could
  194  * be scanning data that was copied from an ancestor timeline when the current
  195  * file was created.)  During a sequential scan we do not allow this value
  196  * to decrease.
  197  */
  198 static TimeLineID recoveryTargetTLI;    /* timeline we want to end in */
  199 static List *expectedTLIs;      /* integer list: recoveryTargetTLI first, then its known parents */
  200 static TimeLineID curFileTLI;   /* TLI in the name of the current input WAL file */
 201
  202 /*
  203  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
  204  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
  205  * end+1 of the last record, and is reset when we end a top-level transaction,
  206  * or start a new one; so it can be used to tell if the current transaction has
  207  * created any XLOG records.  Both start out as {0, 0}.
  208  */
  209 static XLogRecPtr ProcLastRecPtr = {0, 0};      /* private to this file */
  210
  211 XLogRecPtr      XactLastRecEnd = {0, 0};        /* not static: visible outside this file */
 212
  213 /*
  214  * RedoRecPtr is this backend's local copy of the REDO record pointer
  215  * (which is almost but not quite the same as a pointer to the most recent
  216  * CHECKPOINT record).  We update this from the shared-memory copy,
  217  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (i.e., when we
  218  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
  219  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
  220  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
  221  * InitXLOGAccess.
  222  */
  223 static XLogRecPtr RedoRecPtr;   /* no initializer: zero-initialized like any C static */
 224
 225 /*----------
 226  * Shared-memory data structures for XLOG control
 227  *
 228  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 229  * the log up to (all records before that point must be written or fsynced).
 230  * LogwrtResult indicates the byte positions we have already written/fsynced.
 231  * These structs are identical but are declared separately to indicate their
 232  * slightly different functions.
 233  *
 234  * We do a lot of pushups to minimize the amount of access to lockable
 235  * shared memory values.  There are actually three shared-memory copies of
 236  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 237  *              XLogCtl->LogwrtResult is protected by info_lck
 238  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 239  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 240  * One must hold the associated lock to read or write any of these, but
 241  * of course no lock is needed to read/write the unshared LogwrtResult.
 242  *
 243  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 244  * right", since both are updated by a write or flush operation before
 245  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 246  * is that it can be examined/modified by code that already holds WALWriteLock
 247  * without needing to grab info_lck as well.
 248  *
 249  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
  250  * but is updated when convenient.  Again, it exists for the convenience of
 251  * code that is already holding WALInsertLock but not the other locks.
 252  *
 253  * The unshared LogwrtResult may lag behind any or all of these, and again
 254  * is updated when convenient.
 255  *
 256  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 257  * (protected by info_lck), but we don't need to cache any copies of it.
 258  *
 259  * Note that this all works because the request and result positions can only
 260  * advance forward, never back up, and so we can easily determine which of two
 261  * values is "more up to date".
 262  *
 263  * info_lck is only held long enough to read/update the protected variables,
 264  * so it's a plain spinlock.  The other locks are held longer (potentially
 265  * over I/O operations), so we use LWLocks for them.  These locks are:
 266  *
 267  * WALInsertLock: must be held to insert a record into the WAL buffers.
 268  *
 269  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 270  * XLogFlush).
 271  *
 272  * ControlFileLock: must be held to read/update control file or create
 273  * new log file.
 274  *
 275  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 276  * only one checkpointer at a time; currently, with all checkpoints done by
 277  * the bgwriter, this is just pro forma).
 278  *
 279  *----------
 280  */
 281
  282 typedef struct XLogwrtRqst        /* a request: byte positions the log needs to be written / fsynced up to */
  283 {
  284         XLogRecPtr      Write;                  /* last byte + 1 to write out */
  285         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
  286 } XLogwrtRqst;
 287
  288 typedef struct XLogwrtResult      /* a result: byte positions already written / fsynced; same layout as XLogwrtRqst */
  289 {
  290         XLogRecPtr      Write;                  /* last byte + 1 written out */
  291         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
  292 } XLogwrtResult;
 293
  294 /*
  295  * Shared state data for XLogInsert.  Embedded in XLogCtlData as the Insert member, where it is marked as protected by WALInsertLock.
  296  */
  297 typedef struct XLogCtlInsert
  298 {
  299         XLogwrtResult LogwrtResult; /* a recent (possibly lagging) value of LogwrtResult */
  300         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
  301         int                     curridx;                /* current block index in cache (the index INSERT_RECPTR applies to xlblocks[]) */
  302         XLogPageHeader currpage;        /* points to header of block in cache */
  303         char       *currpos;            /* current insertion point in cache; currpos - currpage is the used part of the page */
  304         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions; backends keep a local copy in RedoRecPtr */
  305         bool            forcePageWrites;        /* forcing full-page writes for PITR? (XLogInsert first reads it before taking the lock) */
  306 } XLogCtlInsert;
 307
  308 /*
  309  * Shared state data for XLogWrite/XLogFlush.  Embedded in XLogCtlData as the Write member, where it is marked as protected by WALWriteLock.
  310  */
  311 typedef struct XLogCtlWrite
  312 {
  313         XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
  314         int                     curridx;                /* cache index of next block to write */
  315         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
  316 } XLogCtlWrite;
 317
  318 /*
  319  * Total shared-memory state for XLOG.  Members are grouped by the lock that protects them; see the comment heading each group.
  320  */
  321 typedef struct XLogCtlData
  322 {
  323         /* Protected by WALInsertLock: */
  324         XLogCtlInsert Insert;
  325
  326         /* Protected by info_lck: */
  327         XLogwrtRqst LogwrtRqst;
  328         XLogwrtResult LogwrtResult;
  329         uint32          ckptXidEpoch;   /* epoch that goes with the latest checkpoint's nextXID */
  330         TransactionId ckptXid;  /* nextXID of the latest checkpoint */
  331         XLogRecPtr      asyncCommitLSN; /* LSN of newest async commit */
  332
  333         /* Protected by WALWriteLock: */
  334         XLogCtlWrite Write;
  335
  336         /*
  337          * These values do not change after startup, although the pointed-to pages
  338          * and xlblocks values certainly do.  Permission to read/write the pages
  339          * and xlblocks values depends on WALInsertLock and WALWriteLock.
  340          */
  341         char       *pages;                      /* buffers for unwritten XLOG pages */
  342         XLogRecPtr *xlblocks;           /* per buffer: position of the page's 1st byte + XLOG_BLCKSZ, i.e. just past the page (INSERT_RECPTR subtracts the free space from it) */
  343         int                     XLogCacheBlck;  /* highest allocated xlog buffer index; PrevBufIdx/NextBufIdx wrap around at it */
  344         TimeLineID      ThisTimeLineID; /* NOTE(review): presumably the shared counterpart of the per-process ThisTimeLineID -- confirm */
  345
  346         /*
  347          * SharedRecoveryInProgress indicates if we're still in crash or archive
  348          * recovery.  Protected by info_lck.
  349          */
  350         bool            SharedRecoveryInProgress;
  351
  352         /*
  353          * During recovery, we keep a copy of the latest checkpoint record here.
  354          * Used by the background writer when it wants to create a restartpoint.
  355          *
  356          * Protected by info_lck.
  357          */
  358         XLogRecPtr      lastCheckPointRecPtr;
  359         CheckPoint      lastCheckPoint;
  360
  361         /* end+1 of the last record replayed (or being replayed).  NOTE(review): no lock is named for this field -- confirm whether info_lck covers it */
  362         XLogRecPtr      replayEndRecPtr;
  363
  364         slock_t         info_lck;               /* locks shared variables shown above (a plain spinlock) */
  365 } XLogCtlData;
 366
  367 static XLogCtlData *XLogCtl = NULL;     /* the shared XLOG state (struct XLogCtlData); NULL until it is set up */
  368
  369 /*
  370  * We maintain an image of pg_control in shared memory.  The pointer is NULL until it is set up.
  371  */
  372 static ControlFileData *ControlFile = NULL;
 373
  374 /*
  375  * Macros for managing XLogInsert state.  In most cases, the calling routine
  376  * has a local pointer to XLogCtl->Insert and/or a local copy of XLogCtl->Insert.curridx,
  377  * so these are passed as parameters instead of being fetched via XLogCtl.
  378  */
  379
  380 /* Free space remaining in the current xlog page buffer.  Insert is a pointer to XLogCtlInsert and is evaluated twice. */
  381 #define INSERT_FREESPACE(Insert)  \
  382         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
  383
  384 /* Construct XLogRecPtr value for current insertion point: assigns both fields of recptr (so recptr must be an lvalue) */
  385 #define INSERT_RECPTR(recptr,Insert,curridx)  \
  386         ( \
  387           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
  388           (recptr).xrecoff = \
  389                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
  390         )
  391 /* Buffer index before idx, wrapping from 0 back to XLogCtl->XLogCacheBlck (idx is evaluated twice) */
  392 #define PrevBufIdx(idx)         \
  393                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
  394 /* Buffer index after idx, wrapping from XLogCtl->XLogCacheBlck back to 0 (idx is evaluated twice) */
  395 #define NextBufIdx(idx)         \
  396                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 397
  398 /*
  399  * Private, possibly out-of-date copy of shared LogwrtResult.
  400  * See the discussion of the LogwrtResult copies in the shared-memory comment above; no lock is needed to read or write this one.
  401  */
  402 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};   /* {Write, Flush}, each starting at {0, 0} */
 403
  404 /*
  405  * openLogFile is -1 or a kernel FD for an open log file segment.
  406  * When it's open, openLogOff is the current seek offset in the file.
  407  * openLogId/openLogSeg identify the segment.  These variables are only
  408  * used to write the XLOG, and so will normally refer to the active segment.
  409  */
  410 static int      openLogFile = -1;       /* -1: no segment open for writing */
  411 static uint32 openLogId = 0;    /* meaningful only while openLogFile is open */
  412 static uint32 openLogSeg = 0;   /* meaningful only while openLogFile is open */
  413 static uint32 openLogOff = 0;   /* current seek offset while openLogFile is open */
 414
  415 /*
  416  * These variables are used similarly to the ones above, but for reading
  417  * the XLOG.  Note, however, that readOff generally represents the offset
  418  * of the page just read, not the seek position of the FD itself, which
  419  * will be just past that page.
  420  */
  421 static int      readFile = -1;  /* -1 is the initial value, as for openLogFile */
  422 static uint32 readId = 0;
  423 static uint32 readSeg = 0;
  424 static uint32 readOff = 0;      /* offset of the page just read, not the FD's seek position */
  425
  426 /* Buffer for currently read page (XLOG_BLCKSZ bytes); NULL until it is set up */
  427 static char *readBuf = NULL;
  428
  429 /* Buffer for current ReadRecord result (expandable); readRecordBufSize tracks its size */
  430 static char *readRecordBuf = NULL;
  431 static uint32 readRecordBufSize = 0;
  432
  433 /* State information for XLOG reading */
  434 static XLogRecPtr ReadRecPtr;   /* start of last record read */
  435 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
  436 static XLogRecord *nextRecord = NULL;
  437 static TimeLineID lastPageTLI = 0;
  438
  439 static XLogRecPtr minRecoveryPoint;             /* local copy of
  440                                          * ControlFile->minRecoveryPoint */
  441 static bool updateMinRecoveryPoint = true;      /* NOTE(review): presumably gates UpdateMinRecoveryPoint() -- confirm there */
 442
  443 static bool InRedo = false;     /* NOTE(review): presumably true while the redo loop runs -- it is set outside the visible code, confirm */
  444
  445 /*
  446  * Flags set by interrupt handlers for later service in the redo loop.
  447  * They are volatile sig_atomic_t because they are written from signal handlers.
  448  */
  449 static volatile sig_atomic_t got_SIGHUP = false;
  450 static volatile sig_atomic_t shutdown_requested = false;
  451 /*
  452  * Flag set when executing a restore command, to tell SIGTERM signal handler
  453  * that it's safe to just proc_exit.
  454  */
  455 static volatile sig_atomic_t in_restore_command = false;
 456
  457 /* Prototypes for functions private to this file (every one of them is static). */
  458 static void XLogArchiveNotify(const char *xlog);
  459 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
  460 static bool XLogArchiveCheckDone(const char *xlog);
  461 static bool XLogArchiveIsBusy(const char *xlog);
  462 static void XLogArchiveCleanup(const char *xlog);
  463 static void readRecoveryCommandFile(void);
  464 static void exitArchiveRecovery(TimeLineID endTLI,
  465                                         uint32 endLogId, uint32 endLogSeg);
  466 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
  467 static void LocalSetXLogInsertAllowed(void);
  468 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
  469
  470 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  471                                 XLogRecPtr *lsn, BkpBlock *bkpb);
  472 static bool AdvanceXLInsertBuffer(bool new_segment);
  473 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
  474 static int XLogFileInit(uint32 log, uint32 seg,
  475                          bool *use_existent, bool use_lock);
  476 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
  477                                            bool find_free, int *max_advance,
  478                                            bool use_lock);
  479 static int      XLogFileOpen(uint32 log, uint32 seg);
  480 static int      XLogFileRead(uint32 log, uint32 seg, int emode);
  481 static void XLogFileClose(void);
  482 static bool RestoreArchivedFile(char *path, const char *xlogfname,
  483                                         const char *recovername, off_t expectedSize);
  484 static void ExecuteRecoveryEndCommand(void);
  485 static void PreallocXlogFiles(XLogRecPtr endptr);
  486 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
  487 static void ValidateXLOGDirectoryStructure(void);
  488 static void CleanupBackupHistory(void);
  489 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
  490 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
  491 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
  492 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
  493 static List *readTimeLineHistory(TimeLineID targetTLI);
  494 static bool existsTimeLineHistory(TimeLineID probeTLI);
  495 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
  496 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
  497                                          TimeLineID endTLI,
  498                                          uint32 endLogId, uint32 endLogSeg);
  499 static void WriteControlFile(void);
  500 static void ReadControlFile(void);
  501 static char *str_time(pg_time_t tnow);
  502
  503 #ifdef WAL_DEBUG
  504 static void xlog_outrec(StringInfo buf, XLogRecord *record);    /* declared only in builds that define WAL_DEBUG */
  505 #endif
  506 static void issue_xlog_fsync(void);
  507 static void pg_start_backup_callback(int code, Datum arg);
  508 static bool read_backup_label(XLogRecPtr *checkPointLoc,
  509                                   XLogRecPtr *minRecoveryLoc);
  510 static void rm_redo_error_callback(void *arg);
  511 static int      get_sync_bit(int method);
 512
 513
 514 /*
 515  * Insert an XLOG record having the specified RMID and info bytes,
 516  * with the body of the record being the data chunk(s) described by
 517  * the rdata chain (see xlog.h for notes about rdata).
 518  *
 519  * Returns XLOG pointer to end of record (beginning of next record).
 520  * This can be used as LSN for data pages affected by the logged action.
 521  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 522  * before the data page can be written out.  This implements the basic
 523  * WAL rule "write the log before the data".)
 524  *
 525  * NB: this routine feels free to scribble on the XLogRecData structs,
 526  * though not on the data they reference.  This is OK since the XLogRecData
 527  * structs are always just temporaries in the calling code.
 528  */
 529 XLogRecPtr
 530 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 531 {
 532         XLogCtlInsert *Insert = &XLogCtl->Insert;
 533         XLogRecord *record;
 534         XLogContRecord *contrecord;
 535         XLogRecPtr      RecPtr;
 536         XLogRecPtr      WriteRqst;
 537         uint32          freespace;
 538         int                     curridx;
 539         XLogRecData *rdt;
 540         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 541         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 542         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 543         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 544         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 545         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 546         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 547         pg_crc32        rdata_crc;
 548         uint32          len,
 549                                 write_len;
 550         unsigned        i;
 551         bool            updrqst;
 552         bool            doPageWrites;
 553         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 554
 555         /* cross-check on whether we should be here or not */
 556         if (!XLogInsertAllowed())
 557                 elog(ERROR, "cannot make new WAL entries during recovery");
 558
 559         /* info's high bits are reserved for use by me */
 560         if (info & XLR_INFO_MASK)
 561                 elog(PANIC, "invalid xlog info mask %02X", info);
 562
 563         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 564
 565         /*
 566          * In bootstrap mode, we don't actually log anything but XLOG resources;
 567          * return a phony record pointer.
 568          */
 569         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 570         {
 571                 RecPtr.xlogid = 0;
 572                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 573                 return RecPtr;
 574         }
 575
 576         /*
 577          * Here we scan the rdata chain, determine which buffers must be backed
 578          * up, and compute the CRC values for the data.  Note that the record
 579          * header isn't added into the CRC initially since we don't know the final
 580          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
 581          * the whole record in the order "rdata, then backup blocks, then record
 582          * header".
 583          *
 584          * We may have to loop back to here if a race condition is detected below.
 585          * We could prevent the race by doing all this work while holding the
 586          * insert lock, but it seems better to avoid doing CRC calculations while
 587          * holding the lock.  This means we have to be careful about modifying the
 588          * rdata chain until we know we aren't going to loop back again.  The only
 589          * change we allow ourselves to make earlier is to set rdt->data = NULL in
 590          * chain items we have decided we will have to back up the whole buffer
 591          * for.  This is OK because we will certainly decide the same thing again
 592          * for those items if we do it over; doing it here saves an extra pass
 593          * over the chain later.
 594          */
 595 begin:;
 596         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 597         {
 598                 dtbuf[i] = InvalidBuffer;
 599                 dtbuf_bkp[i] = false;
 600         }
 601
 602         /*
 603          * Decide if we need to do full-page writes in this XLOG record: true if
 604          * full_page_writes is on or we have a PITR request for it.  Since we
 605          * don't yet have the insert lock, forcePageWrites could change under us,
 606          * but we'll recheck it once we have the lock.
 607          */
 608         doPageWrites = fullPageWrites || Insert->forcePageWrites;
 609
 610         INIT_CRC32(rdata_crc);
 611         len = 0;
 612         for (rdt = rdata;;)
 613         {
 614                 if (rdt->buffer == InvalidBuffer)
 615                 {
 616                         /* Simple data, just include it */
 617                         len += rdt->len;
 618                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 619                 }
 620                 else
 621                 {
 622                         /* Find info for buffer */
 623                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 624                         {
 625                                 if (rdt->buffer == dtbuf[i])
 626                                 {
 627                                         /* Buffer already referenced by earlier chain item */
 628                                         if (dtbuf_bkp[i])
 629                                                 rdt->data = NULL;
 630                                         else if (rdt->data)
 631                                         {
 632                                                 len += rdt->len;
 633                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 634                                         }
 635                                         break;
 636                                 }
 637                                 if (dtbuf[i] == InvalidBuffer)
 638                                 {
 639                                         /* OK, put it in this slot */
 640                                         dtbuf[i] = rdt->buffer;
 641                                         if (XLogCheckBuffer(rdt, doPageWrites,
 642                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 643                                         {
 644                                                 dtbuf_bkp[i] = true;
 645                                                 rdt->data = NULL;
 646                                         }
 647                                         else if (rdt->data)
 648                                         {
 649                                                 len += rdt->len;
 650                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 651                                         }
 652                                         break;
 653                                 }
 654                         }
 655                         if (i >= XLR_MAX_BKP_BLOCKS)
 656                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 657                                          XLR_MAX_BKP_BLOCKS);
 658                 }
 659                 /* Break out of loop when rdt points to last chain item */
 660                 if (rdt->next == NULL)
 661                         break;
 662                 rdt = rdt->next;
 663         }
 664
 665         /*
 666          * Now add the backup block headers and data into the CRC
 667          */
 668         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 669         {
 670                 if (dtbuf_bkp[i])
 671                 {
 672                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
 673                         char       *page;
 674
 675                         COMP_CRC32(rdata_crc,
 676                                            (char *) bkpb,
 677                                            sizeof(BkpBlock));
 678                         page = (char *) BufferGetBlock(dtbuf[i]);
 679                         if (bkpb->hole_length == 0)
 680                         {
 681                                 COMP_CRC32(rdata_crc,
 682                                                    page,
 683                                                    BLCKSZ);
 684                         }
 685                         else
 686                         {
 687                                 /* must skip the hole */
 688                                 COMP_CRC32(rdata_crc,
 689                                                    page,
 690                                                    bkpb->hole_offset);
 691                                 COMP_CRC32(rdata_crc,
 692                                                    page + (bkpb->hole_offset + bkpb->hole_length),
 693                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
 694                         }
 695                 }
 696         }
 697
 698         /*
 699          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 700          * error checking in ReadRecord.  This means that all callers of
 701          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 702          * make an exception for XLOG SWITCH records because we don't want them to
 703          * ever cross a segment boundary.
 704          */
 705         if (len == 0 && !isLogSwitch)
 706                 elog(PANIC, "invalid xlog record length %u", len);
 707
 708         START_CRIT_SECTION();
 709
 710         /* Now wait to get insert lock */
 711         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 712
 713         /*
 714          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 715          * back and recompute everything.  This can only happen just after a
 716          * checkpoint, so it's better to be slow in this case and fast otherwise.
 717          *
 718          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 719          * affect the contents of the XLOG record, so we'll update our local copy
 720          * but not force a recomputation.
 721          */
 722         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 723         {
 724                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 725                 RedoRecPtr = Insert->RedoRecPtr;
 726
 727                 if (doPageWrites)
 728                 {
 729                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 730                         {
 731                                 if (dtbuf[i] == InvalidBuffer)
 732                                         continue;
 733                                 if (dtbuf_bkp[i] == false &&
 734                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 735                                 {
 736                                         /*
 737                                          * Oops, this buffer now needs to be backed up, but we
 738                                          * didn't think so above.  Start over.
 739                                          */
 740                                         LWLockRelease(WALInsertLock);
 741                                         END_CRIT_SECTION();
 742                                         goto begin;
 743                                 }
 744                         }
 745                 }
 746         }
 747
 748         /*
 749          * Also check to see if forcePageWrites was just turned on; if we weren't
 750          * already doing full-page writes then go back and recompute. (If it was
 751          * just turned off, we could recompute the record without full pages, but
 752          * we choose not to bother.)
 753          */
 754         if (Insert->forcePageWrites && !doPageWrites)
 755         {
 756                 /* Oops, must redo it with full-page data */
 757                 LWLockRelease(WALInsertLock);
 758                 END_CRIT_SECTION();
 759                 goto begin;
 760         }
 761
 762         /*
 763          * Make additional rdata chain entries for the backup blocks, so that we
 764          * don't need to special-case them in the write loop.  Note that we have
 765          * now irrevocably changed the input rdata chain.  At the exit of this
 766          * loop, write_len includes the backup block data.
 767          *
 768          * Also set the appropriate info bits to show which buffers were backed
 769          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 770          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 771          */
 772         write_len = len;
 773         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 774         {
 775                 BkpBlock   *bkpb;
 776                 char       *page;
 777
 778                 if (!dtbuf_bkp[i])
 779                         continue;
 780
 781                 info |= XLR_SET_BKP_BLOCK(i);
 782
 783                 bkpb = &(dtbuf_xlg[i]);
 784                 page = (char *) BufferGetBlock(dtbuf[i]);
 785
 786                 rdt->next = &(dtbuf_rdt1[i]);
 787                 rdt = rdt->next;
 788
 789                 rdt->data = (char *) bkpb;
 790                 rdt->len = sizeof(BkpBlock);
 791                 write_len += sizeof(BkpBlock);
 792
 793                 rdt->next = &(dtbuf_rdt2[i]);
 794                 rdt = rdt->next;
 795
 796                 if (bkpb->hole_length == 0)
 797                 {
 798                         rdt->data = page;
 799                         rdt->len = BLCKSZ;
 800                         write_len += BLCKSZ;
 801                         rdt->next = NULL;
 802                 }
 803                 else
 804                 {
 805                         /* must skip the hole */
 806                         rdt->data = page;
 807                         rdt->len = bkpb->hole_offset;
 808                         write_len += bkpb->hole_offset;
 809
 810                         rdt->next = &(dtbuf_rdt3[i]);
 811                         rdt = rdt->next;
 812
 813                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 814                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 815                         write_len += rdt->len;
 816                         rdt->next = NULL;
 817                 }
 818         }
 819
 820         /*
 821          * If we backed up any full blocks and online backup is not in progress,
 822          * mark the backup blocks as removable.  This allows the WAL archiver to
 823          * know whether it is safe to compress archived WAL data by transforming
 824          * full-block records into the non-full-block format.
 825          *
 826          * Note: we could just set the flag whenever !forcePageWrites, but
 827          * defining it like this leaves the info bit free for some potential other
 828          * use in records without any backup blocks.
 829          */
 830         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
 831                 info |= XLR_BKP_REMOVABLE;
 832
 833         /*
 834          * If there isn't enough space on the current XLOG page for a record
 835          * header, advance to the next page (leaving the unused space as zeroes).
 836          */
 837         updrqst = false;
 838         freespace = INSERT_FREESPACE(Insert);
 839         if (freespace < SizeOfXLogRecord)
 840         {
 841                 updrqst = AdvanceXLInsertBuffer(false);
 842                 freespace = INSERT_FREESPACE(Insert);
 843         }
 844
 845         /* Compute record's XLOG location */
 846         curridx = Insert->curridx;
 847         INSERT_RECPTR(RecPtr, Insert, curridx);
 848
 849         /*
 850          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 851          * segment, we need not insert it (and don't want to because we'd like
 852          * consecutive switch requests to be no-ops).  Instead, make sure
 853          * everything is written and flushed through the end of the prior segment,
 854          * and return the prior segment's end address.
 855          */
 856         if (isLogSwitch &&
 857                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 858         {
 859                 /* We can release insert lock immediately */
 860                 LWLockRelease(WALInsertLock);
 861
 862                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 863                 if (RecPtr.xrecoff == 0)
 864                 {
 865                         /* crossing a logid boundary */
 866                         RecPtr.xlogid -= 1;
 867                         RecPtr.xrecoff = XLogFileSize;
 868                 }
 869
 870                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 871                 LogwrtResult = XLogCtl->Write.LogwrtResult;
 872                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
 873                 {
 874                         XLogwrtRqst FlushRqst;
 875
 876                         FlushRqst.Write = RecPtr;
 877                         FlushRqst.Flush = RecPtr;
 878                         XLogWrite(FlushRqst, false, false);
 879                 }
 880                 LWLockRelease(WALWriteLock);
 881
 882                 END_CRIT_SECTION();
 883
 884                 return RecPtr;
 885         }
 886
 887         /* Insert record header */
 888
 889         record = (XLogRecord *) Insert->currpos;
 890         record->xl_prev = Insert->PrevRecord;
 891         record->xl_xid = GetCurrentTransactionIdIfAny();
 892         record->xl_tot_len = SizeOfXLogRecord + write_len;
 893         record->xl_len = len;           /* doesn't include backup blocks */
 894         record->xl_info = info;
 895         record->xl_rmid = rmid;
 896
 897         /* Now we can finish computing the record's CRC */
 898         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
 899                            SizeOfXLogRecord - sizeof(pg_crc32));
 900         FIN_CRC32(rdata_crc);
 901         record->xl_crc = rdata_crc;
 902
 903 #ifdef WAL_DEBUG
 904         if (XLOG_DEBUG)
 905         {
 906                 StringInfoData buf;
 907
 908                 initStringInfo(&buf);
 909                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
 910                                                  RecPtr.xlogid, RecPtr.xrecoff);
 911                 xlog_outrec(&buf, record);
 912                 if (rdata->data != NULL)
 913                 {
 914                         appendStringInfo(&buf, " - ");
 915                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
 916                 }
 917                 elog(LOG, "%s", buf.data);
 918                 pfree(buf.data);
 919         }
 920 #endif
 921
 922         /* Record begin of record in appropriate places */
 923         ProcLastRecPtr = RecPtr;
 924         Insert->PrevRecord = RecPtr;
 925
 926         Insert->currpos += SizeOfXLogRecord;
 927         freespace -= SizeOfXLogRecord;
 928
 929         /*
 930          * Append the data, including backup blocks if any
 931          */
 932         while (write_len)
 933         {
 934                 while (rdata->data == NULL)
 935                         rdata = rdata->next;
 936
 937                 if (freespace > 0)
 938                 {
 939                         if (rdata->len > freespace)
 940                         {
 941                                 memcpy(Insert->currpos, rdata->data, freespace);
 942                                 rdata->data += freespace;
 943                                 rdata->len -= freespace;
 944                                 write_len -= freespace;
 945                         }
 946                         else
 947                         {
 948                                 memcpy(Insert->currpos, rdata->data, rdata->len);
 949                                 freespace -= rdata->len;
 950                                 write_len -= rdata->len;
 951                                 Insert->currpos += rdata->len;
 952                                 rdata = rdata->next;
 953                                 continue;
 954                         }
 955                 }
 956
 957                 /* Use next buffer */
 958                 updrqst = AdvanceXLInsertBuffer(false);
 959                 curridx = Insert->curridx;
 960                 /* Insert cont-record header */
 961                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 962                 contrecord = (XLogContRecord *) Insert->currpos;
 963                 contrecord->xl_rem_len = write_len;
 964                 Insert->currpos += SizeOfXLogContRecord;
 965                 freespace = INSERT_FREESPACE(Insert);
 966         }
 967
 968         /* Ensure next record will be properly aligned */
 969         Insert->currpos = (char *) Insert->currpage +
 970                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
 971         freespace = INSERT_FREESPACE(Insert);
 972
 973         /*
 974          * The recptr I return is the beginning of the *next* record. This will be
 975          * stored as LSN for changed data pages...
 976          */
 977         INSERT_RECPTR(RecPtr, Insert, curridx);
 978
 979         /*
 980          * If the record is an XLOG_SWITCH, we must now write and flush all the
 981          * existing data, and then forcibly advance to the start of the next
 982          * segment.  It's not good to do this I/O while holding the insert lock,
 983          * but there seems too much risk of confusion if we try to release the
 984          * lock sooner.  Fortunately xlog switch needn't be a high-performance
 985          * operation anyway...
 986          */
 987         if (isLogSwitch)
 988         {
 989                 XLogCtlWrite *Write = &XLogCtl->Write;
 990                 XLogwrtRqst FlushRqst;
 991                 XLogRecPtr      OldSegEnd;
 992
 993                 TRACE_POSTGRESQL_XLOG_SWITCH();
 994
 995                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 996
 997                 /*
 998                  * Flush through the end of the page containing XLOG_SWITCH, and
 999                  * perform end-of-segment actions (eg, notifying archiver).
1000                  */
1001                 WriteRqst = XLogCtl->xlblocks[curridx];
1002                 FlushRqst.Write = WriteRqst;
1003                 FlushRqst.Flush = WriteRqst;
1004                 XLogWrite(FlushRqst, false, true);
1005
1006                 /* Set up the next buffer as first page of next segment */
1007                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1008                 (void) AdvanceXLInsertBuffer(true);
1009
1010                 /* There should be no unwritten data */
1011                 curridx = Insert->curridx;
1012                 Assert(curridx == Write->curridx);
1013
1014                 /* Compute end address of old segment */
1015                 OldSegEnd = XLogCtl->xlblocks[curridx];
1016                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
1017                 if (OldSegEnd.xrecoff == 0)
1018                 {
1019                         /* crossing a logid boundary */
1020                         OldSegEnd.xlogid -= 1;
1021                         OldSegEnd.xrecoff = XLogFileSize;
1022                 }
1023
1024                 /* Make it look like we've written and synced all of old segment */
1025                 LogwrtResult.Write = OldSegEnd;
1026                 LogwrtResult.Flush = OldSegEnd;
1027
1028                 /*
1029                  * Update shared-memory status --- this code should match XLogWrite
1030                  */
1031                 {
1032                         /* use volatile pointer to prevent code rearrangement */
1033                         volatile XLogCtlData *xlogctl = XLogCtl;
1034
1035                         SpinLockAcquire(&xlogctl->info_lck);
1036                         xlogctl->LogwrtResult = LogwrtResult;
1037                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1038                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1039                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1040                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1041                         SpinLockRelease(&xlogctl->info_lck);
1042                 }
1043
1044                 Write->LogwrtResult = LogwrtResult;
1045
1046                 LWLockRelease(WALWriteLock);
1047
1048                 updrqst = false;                /* done already */
1049         }
1050         else
1051         {
1052                 /* normal case, ie not xlog switch */
1053
1054                 /* Need to update shared LogwrtRqst if some block was filled up */
1055                 if (freespace < SizeOfXLogRecord)
1056                 {
1057                         /* curridx is filled and available for writing out */
1058                         updrqst = true;
1059                 }
1060                 else
1061                 {
1062                         /* if updrqst already set, write through end of previous buf */
1063                         curridx = PrevBufIdx(curridx);
1064                 }
1065                 WriteRqst = XLogCtl->xlblocks[curridx];
1066         }
1067
1068         LWLockRelease(WALInsertLock);
1069
1070         if (updrqst)
1071         {
1072                 /* use volatile pointer to prevent code rearrangement */
1073                 volatile XLogCtlData *xlogctl = XLogCtl;
1074
1075                 SpinLockAcquire(&xlogctl->info_lck);
1076                 /* advance global request to include new block(s) */
1077                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1078                         xlogctl->LogwrtRqst.Write = WriteRqst;
1079                 /* update local result copy while I have the chance */
1080                 LogwrtResult = xlogctl->LogwrtResult;
1081                 SpinLockRelease(&xlogctl->info_lck);
1082         }
1083
1084         XactLastRecEnd = RecPtr;
1085
1086         END_CRIT_SECTION();
1087
1088         return RecPtr;
1089 }
1090
1091 /*
1092  * Determine whether the buffer referenced by an XLogRecData item has to
1093  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1094  * save the buffer's LSN at *lsn.
1095  */
1096 static bool
1097 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1098                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1099 {
1100         Page            page;
1101
1102         page = BufferGetPage(rdata->buffer);
1103
1104         /*
1105          * XXX We assume page LSN is first data on *every* page that can be passed
1106          * to XLogInsert, whether it otherwise has the standard page layout or
1107          * not.
1108          */
1109         *lsn = PageGetLSN(page);
1110
1111         if (doPageWrites &&
1112                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1113         {
1114                 /*
1115                  * The page needs to be backed up, so set up *bkpb
1116                  */
1117                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1118
1119                 if (rdata->buffer_std)
1120                 {
1121                         /* Assume we can omit data between pd_lower and pd_upper */
1122                         uint16          lower = ((PageHeader) page)->pd_lower;
1123                         uint16          upper = ((PageHeader) page)->pd_upper;
1124
1125                         if (lower >= SizeOfPageHeaderData &&
1126                                 upper > lower &&
1127                                 upper <= BLCKSZ)
1128                         {
1129                                 bkpb->hole_offset = lower;
1130                                 bkpb->hole_length = upper - lower;
1131                         }
1132                         else
1133                         {
1134                                 /* No "hole" to compress out */
1135                                 bkpb->hole_offset = 0;
1136                                 bkpb->hole_length = 0;
1137                         }
1138                 }
1139                 else
1140                 {
1141                         /* Not a standard page header, don't try to eliminate "hole" */
1142                         bkpb->hole_offset = 0;
1143                         bkpb->hole_length = 0;
1144                 }
1145
1146                 return true;                    /* buffer requires backup */
1147         }
1148
1149         return false;                           /* buffer does not need to be backed up */
1150 }
1151
1152 /*
1153  * XLogArchiveNotify
1154  *
1155  * Create an archive notification file
1156  *
1157  * The name of the notification file is the message that will be picked up
1158  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1159  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1160  * then when complete, rename it to 0000000100000001000000C6.done
1161  */
1162 static void
1163 XLogArchiveNotify(const char *xlog)
1164 {
1165         char            archiveStatusPath[MAXPGPATH];
1166         FILE       *fd;
1167
1168         /* insert an otherwise empty file called <XLOG>.ready */
1169         StatusFilePath(archiveStatusPath, xlog, ".ready");
1170         fd = AllocateFile(archiveStatusPath, "w");
1171         if (fd == NULL)
1172         {
1173                 ereport(LOG,
1174                                 (errcode_for_file_access(),
1175                                  errmsg("could not create archive status file \"%s\": %m",
1176                                                 archiveStatusPath)));
1177                 return;
1178         }
1179         if (FreeFile(fd))
1180         {
1181                 ereport(LOG,
1182                                 (errcode_for_file_access(),
1183                                  errmsg("could not write archive status file \"%s\": %m",
1184                                                 archiveStatusPath)));
1185                 return;
1186         }
1187
1188         /* Notify archiver that it's got something to do */
1189         if (IsUnderPostmaster)
1190                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1191 }
1192
1193 /*
1194  * Convenience routine to notify using log/seg representation of filename
1195  */
1196 static void
1197 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1198 {
1199         char            xlog[MAXFNAMELEN];
1200
1201         XLogFileName(xlog, ThisTimeLineID, log, seg);
1202         XLogArchiveNotify(xlog);
1203 }
1204
1205 /*
1206  * XLogArchiveCheckDone
1207  *
1208  * This is called when we are ready to delete or recycle an old XLOG segment
1209  * file or backup history file.  If it is okay to delete it then return true.
1210  * If it is not time to delete it, make sure a .ready file exists, and return
1211  * false.
1212  *
1213  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1214  * then return false; else create <XLOG>.ready and return false.
1215  *
1216  * The reason we do things this way is so that if the original attempt to
1217  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1218  */
1219 static bool
1220 XLogArchiveCheckDone(const char *xlog)
1221 {
1222         char            archiveStatusPath[MAXPGPATH];
1223         struct stat stat_buf;
1224
1225         /* Always deletable if archiving is off */
1226         if (!XLogArchivingActive())
1227                 return true;
1228
1229         /* First check for .done --- this means archiver is done with it */
1230         StatusFilePath(archiveStatusPath, xlog, ".done");
1231         if (stat(archiveStatusPath, &stat_buf) == 0)
1232                 return true;
1233
1234         /* check for .ready --- this means archiver is still busy with it */
1235         StatusFilePath(archiveStatusPath, xlog, ".ready");
1236         if (stat(archiveStatusPath, &stat_buf) == 0)
1237                 return false;
1238
1239         /* Race condition --- maybe archiver just finished, so recheck */
1240         StatusFilePath(archiveStatusPath, xlog, ".done");
1241         if (stat(archiveStatusPath, &stat_buf) == 0)
1242                 return true;
1243
1244         /* Retry creation of the .ready file */
1245         XLogArchiveNotify(xlog);
1246         return false;
1247 }
1248
1249 /*
1250  * XLogArchiveIsBusy
1251  *
1252  * Check to see if an XLOG segment file is still unarchived.
1253  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1254  * the first place we aren't chartered to recreate the .ready file, and
1255  * in the second place we should consider that if the file is already gone
1256  * then it's not busy.  (This check is needed to handle the race condition
1257  * that a checkpoint already deleted the no-longer-needed file.)
1258  */
1259 static bool
1260 XLogArchiveIsBusy(const char *xlog)
1261 {
1262         char            archiveStatusPath[MAXPGPATH];
1263         struct stat stat_buf;
1264
1265         /* First check for .done --- this means archiver is done with it */
1266         StatusFilePath(archiveStatusPath, xlog, ".done");
1267         if (stat(archiveStatusPath, &stat_buf) == 0)
1268                 return false;
1269
1270         /* check for .ready --- this means archiver is still busy with it */
1271         StatusFilePath(archiveStatusPath, xlog, ".ready");
1272         if (stat(archiveStatusPath, &stat_buf) == 0)
1273                 return true;
1274
1275         /* Race condition --- maybe archiver just finished, so recheck */
1276         StatusFilePath(archiveStatusPath, xlog, ".done");
1277         if (stat(archiveStatusPath, &stat_buf) == 0)
1278                 return false;
1279
1280         /*
1281          * Check to see if the WAL file has been removed by checkpoint, which
1282          * implies it has already been archived, and explains why we can't see a
1283          * status file for it.
1284          */
1285         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1286         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1287                 errno == ENOENT)
1288                 return false;
1289
1290         return true;
1291 }
1292
1293 /*
1294  * XLogArchiveCleanup
1295  *
1296  * Cleanup archive notification file(s) for a particular xlog segment
1297  */
1298 static void
1299 XLogArchiveCleanup(const char *xlog)
1300 {
1301         char            archiveStatusPath[MAXPGPATH];
1302
1303         /* Remove the .done file */
1304         StatusFilePath(archiveStatusPath, xlog, ".done");
1305         unlink(archiveStatusPath);
1306         /* should we complain about failure? */
1307
1308         /* Remove the .ready file if present --- normally it shouldn't be */
1309         StatusFilePath(archiveStatusPath, xlog, ".ready");
1310         unlink(archiveStatusPath);
1311         /* should we complain about failure? */
1312 }
1313
1314 /*
1315  * Advance the Insert state to the next buffer page, writing out the next
1316  * buffer if it still contains unwritten data.
1317  *
1318  * If new_segment is TRUE then we set up the next buffer page as the first
1319  * page of the next xlog segment file, possibly but not usually the next
1320  * consecutive file page.
1321  *
1322  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1323  * just-filled page.  If we can do this for free (without an extra lock),
1324  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1325  * request update still needs to be done, FALSE if we did it internally.
1326  *
1327  * Must be called with WALInsertLock held.
1328  */
1329 static bool
1330 AdvanceXLInsertBuffer(bool new_segment)
1331 {
1332         XLogCtlInsert *Insert = &XLogCtl->Insert;
1333         XLogCtlWrite *Write = &XLogCtl->Write;
1334         int                     nextidx = NextBufIdx(Insert->curridx);
1335         bool            update_needed = true;
1336         XLogRecPtr      OldPageRqstPtr;
1337         XLogwrtRqst WriteRqst;
1338         XLogRecPtr      NewPageEndPtr;
1339         XLogPageHeader NewPage;
1340
1341         /* Use Insert->LogwrtResult copy if it's more fresh */
1342         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
1343                 LogwrtResult = Insert->LogwrtResult;
1344
1345         /*
1346          * Get ending-offset of the buffer page we need to replace (this may be
1347          * zero if the buffer hasn't been used yet).  Fall through if it's already
1348          * written out.
1349          */
1350         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1351         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1352         {
1353                 /* nope, got work to do... */
1354                 XLogRecPtr      FinishedPageRqstPtr;
1355
1356                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1357
1358                 /* Before waiting, get info_lck and update LogwrtResult */
1359                 {
1360                         /* use volatile pointer to prevent code rearrangement */
1361                         volatile XLogCtlData *xlogctl = XLogCtl;
1362
1363                         SpinLockAcquire(&xlogctl->info_lck);
1364                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1365                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1366                         LogwrtResult = xlogctl->LogwrtResult;
1367                         SpinLockRelease(&xlogctl->info_lck);
1368                 }
1369
1370                 update_needed = false;  /* Did the shared-request update */
1371
1372                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1373                 {
1374                         /* OK, someone wrote it already */
1375                         Insert->LogwrtResult = LogwrtResult;
1376                 }
1377                 else
1378                 {
1379                         /* Must acquire write lock */
1380                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1381                         LogwrtResult = Write->LogwrtResult;
1382                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1383                         {
1384                                 /* OK, someone wrote it already */
1385                                 LWLockRelease(WALWriteLock);
1386                                 Insert->LogwrtResult = LogwrtResult;
1387                         }
1388                         else
1389                         {
1390                                 /*
1391                                  * Have to write buffers while holding insert lock. This is
1392                                  * not good, so only write as much as we absolutely must.
1393                                  */
1394                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1395                                 WriteRqst.Write = OldPageRqstPtr;
1396                                 WriteRqst.Flush.xlogid = 0;
1397                                 WriteRqst.Flush.xrecoff = 0;
1398                                 XLogWrite(WriteRqst, false, false);
1399                                 LWLockRelease(WALWriteLock);
1400                                 Insert->LogwrtResult = LogwrtResult;
1401                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1402                         }
1403                 }
1404         }
1405
1406         /*
1407          * Now the next buffer slot is free and we can set it up to be the next
1408          * output page.
1409          */
1410         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1411
1412         if (new_segment)
1413         {
1414                 /* force it to a segment start point */
1415                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1416                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1417         }
1418
1419         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1420         {
1421                 /* crossing a logid boundary */
1422                 NewPageEndPtr.xlogid += 1;
1423                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1424         }
1425         else
1426                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1427         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1428         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1429
1430         Insert->curridx = nextidx;
1431         Insert->currpage = NewPage;
1432
1433         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1434
1435         /*
1436          * Be sure to re-zero the buffer so that bytes beyond what we've written
1437          * will look like zeroes and not valid XLOG records...
1438          */
1439         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1440
1441         /*
1442          * Fill the new page's header
1443          */
1444         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1445
1446         /* NewPage->xlp_info = 0; */    /* done by memset */
1447         NewPage   ->xlp_tli = ThisTimeLineID;
1448         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1449         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1450
1451         /*
1452          * If first page of an XLOG segment file, make it a long header.
1453          */
1454         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1455         {
1456                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1457
1458                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1459                 NewLongPage->xlp_seg_size = XLogSegSize;
1460                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1461                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1462
1463                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1464         }
1465
1466         return update_needed;
1467 }
1468
1469 /*
1470  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1471  *
1472  * Caller must have just finished filling the open log file (so that
1473  * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
1474  * to the open log file and see if that exceeds CheckPointSegments.
1475  *
1476  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1477  */
1478 static bool
1479 XLogCheckpointNeeded(void)
1480 {
1481         /*
1482          * A straight computation of segment number could overflow 32 bits. Rather
1483          * than assuming we have working 64-bit arithmetic, we compare the
1484          * highest-order bits separately, and force a checkpoint immediately when
1485          * they change.
1486          */
1487         uint32          old_segno,
1488                                 new_segno;
1489         uint32          old_highbits,
1490                                 new_highbits;
1491
1492         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1493                 (RedoRecPtr.xrecoff / XLogSegSize);
1494         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1495         new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
1496         new_highbits = openLogId / XLogSegSize;
1497         if (new_highbits != old_highbits ||
1498                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1499                 return true;
1500         return false;
1501 }
1502
1503 /*
1504  * Write and/or fsync the log at least as far as WriteRqst indicates.
1505  *
1506  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1507  * may stop at any convenient boundary (such as a cache or logfile boundary).
1508  * This option allows us to avoid uselessly issuing multiple writes when a
1509  * single one would do.
1510  *
1511  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1512  * perform end-of-segment actions after writing the last page, even if
1513  * it's not physically the end of its segment.  (NB: this will work properly
1514  * only if caller specifies WriteRqst == page-end and flexible == false,
1515  * and there is some data to write.)
1516  *
1517  * Must be called with WALWriteLock held.
1518  */
1519 static void
1520 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1521 {
1522         XLogCtlWrite *Write = &XLogCtl->Write;
1523         bool            ispartialpage;
1524         bool            last_iteration;
1525         bool            finishing_seg;
1526         bool            use_existent;
1527         int                     curridx;
1528         int                     npages;
1529         int                     startidx;
1530         uint32          startoffset;
1531
1532         /* We should always be inside a critical section here */
1533         Assert(CritSectionCount > 0);
1534
1535         /*
1536          * Update local LogwrtResult (caller probably did this already, but...)
1537          */
1538         LogwrtResult = Write->LogwrtResult;
1539
1540         /*
1541          * Since successive pages in the xlog cache are consecutively allocated,
1542          * we can usually gather multiple pages together and issue just one
1543          * write() call.  npages is the number of pages we have determined can be
1544          * written together; startidx is the cache block index of the first one,
1545          * and startoffset is the file offset at which it should go. The latter
1546          * two variables are only valid when npages > 0, but we must initialize
1547          * all of them to keep the compiler quiet.
1548          */
1549         npages = 0;
1550         startidx = 0;
1551         startoffset = 0;
1552
1553         /*
1554          * Within the loop, curridx is the cache block index of the page to
1555          * consider writing.  We advance Write->curridx only after successfully
1556          * writing pages.  (Right now, this refinement is useless since we are
1557          * going to PANIC if any error occurs anyway; but someday it may come in
1558          * useful.)
1559          */
1560         curridx = Write->curridx;
1561
1562         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1563         {
1564                 /*
1565                  * Make sure we're not ahead of the insert process.  This could happen
1566                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1567                  * last page that's been initialized by AdvanceXLInsertBuffer.
1568                  */
1569                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1570                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1571                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1572                                  XLogCtl->xlblocks[curridx].xlogid,
1573                                  XLogCtl->xlblocks[curridx].xrecoff);
1574
1575                 /* Advance LogwrtResult.Write to end of current buffer page */
1576                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1577                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1578
1579                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1580                 {
1581                         /*
1582                          * Switch to new logfile segment.  We cannot have any pending
1583                          * pages here (since we dump what we have at segment end).
1584                          */
1585                         Assert(npages == 0);
1586                         if (openLogFile >= 0)
1587                                 XLogFileClose();
1588                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1589
1590                         /* create/use new log file */
1591                         use_existent = true;
1592                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1593                                                                            &use_existent, true);
1594                         openLogOff = 0;
1595                 }
1596
1597                 /* Make sure we have the current logfile open */
1598                 if (openLogFile < 0)
1599                 {
1600                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1601                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1602                         openLogOff = 0;
1603                 }
1604
1605                 /* Add current page to the set of pending pages-to-dump */
1606                 if (npages == 0)
1607                 {
1608                         /* first of group */
1609                         startidx = curridx;
1610                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1611                 }
1612                 npages++;
1613
1614                 /*
1615                  * Dump the set if this will be the last loop iteration, or if we are
1616                  * at the last page of the cache area (since the next page won't be
1617                  * contiguous in memory), or if we are at the end of the logfile
1618                  * segment.
1619                  */
1620                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1621
1622                 finishing_seg = !ispartialpage &&
1623                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1624
1625                 if (last_iteration ||
1626                         curridx == XLogCtl->XLogCacheBlck ||
1627                         finishing_seg)
1628                 {
1629                         char       *from;
1630                         Size            nbytes;
1631
1632                         /* Need to seek in the file? */
1633                         if (openLogOff != startoffset)
1634                         {
1635                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1636                                         ereport(PANIC,
1637                                                         (errcode_for_file_access(),
1638                                                          errmsg("could not seek in log file %u, "
1639                                                                         "segment %u to offset %u: %m",
1640                                                                         openLogId, openLogSeg, startoffset)));
1641                                 openLogOff = startoffset;
1642                         }
1643
1644                         /* OK to write the page(s) */
1645                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1646                         nbytes = npages * (Size) XLOG_BLCKSZ;
1647                         errno = 0;
1648                         if (write(openLogFile, from, nbytes) != nbytes)
1649                         {
1650                                 /* if write didn't set errno, assume no disk space */
1651                                 if (errno == 0)
1652                                         errno = ENOSPC;
1653                                 ereport(PANIC,
1654                                                 (errcode_for_file_access(),
1655                                                  errmsg("could not write to log file %u, segment %u "
1656                                                                 "at offset %u, length %lu: %m",
1657                                                                 openLogId, openLogSeg,
1658                                                                 openLogOff, (unsigned long) nbytes)));
1659                         }
1660
1661                         /* Update state for write */
1662                         openLogOff += nbytes;
1663                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1664                         npages = 0;
1665
1666                         /*
1667                          * If we just wrote the whole last page of a logfile segment,
1668                          * fsync the segment immediately.  This avoids having to go back
1669                          * and re-open prior segments when an fsync request comes along
1670                          * later. Doing it here ensures that one and only one backend will
1671                          * perform this fsync.
1672                          *
1673                          * We also do this if this is the last page written for an xlog
1674                          * switch.
1675                          *
1676                          * This is also the right place to notify the Archiver that the
1677                          * segment is ready to copy to archival storage, and to update the
1678                          * timer for archive_timeout, and to signal for a checkpoint if
1679                          * too many logfile segments have been used since the last
1680                          * checkpoint.
1681                          */
1682                         if (finishing_seg || (xlog_switch && last_iteration))
1683                         {
1684                                 issue_xlog_fsync();
1685                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1686
1687                                 if (XLogArchivingActive())
1688                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1689
1690                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1691
1692                                 /*
1693                                  * Signal bgwriter to start a checkpoint if we've consumed too
1694                                  * much xlog since the last one.  For speed, we first check
1695                                  * using the local copy of RedoRecPtr, which might be out of
1696                                  * date; if it looks like a checkpoint is needed, forcibly
1697                                  * update RedoRecPtr and recheck.
1698                                  */
1699                                 if (IsUnderPostmaster &&
1700                                         XLogCheckpointNeeded())
1701                                 {
1702                                         (void) GetRedoRecPtr();
1703                                         if (XLogCheckpointNeeded())
1704                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1705                                 }
1706                         }
1707                 }
1708
1709                 if (ispartialpage)
1710                 {
1711                         /* Only asked to write a partial page */
1712                         LogwrtResult.Write = WriteRqst.Write;
1713                         break;
1714                 }
1715                 curridx = NextBufIdx(curridx);
1716
1717                 /* If flexible, break out of loop as soon as we wrote something */
1718                 if (flexible && npages == 0)
1719                         break;
1720         }
1721
1722         Assert(npages == 0);
1723         Assert(curridx == Write->curridx);
1724
1725         /*
1726          * If asked to flush, do so
1727          */
1728         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1729                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1730         {
1731                 /*
1732                  * Could get here without iterating above loop, in which case we might
1733                  * have no open file or the wrong one.  However, we do not need to
1734                  * fsync more than one file.
1735                  */
1736                 if (sync_method != SYNC_METHOD_OPEN &&
1737                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1738                 {
1739                         if (openLogFile >= 0 &&
1740                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1741                                 XLogFileClose();
1742                         if (openLogFile < 0)
1743                         {
1744                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1745                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1746                                 openLogOff = 0;
1747                         }
1748                         issue_xlog_fsync();
1749                 }
1750                 LogwrtResult.Flush = LogwrtResult.Write;
1751         }
1752
1753         /*
1754          * Update shared-memory status
1755          *
1756          * We make sure that the shared 'request' values do not fall behind the
1757          * 'result' values.  This is not absolutely essential, but it saves some
1758          * code in a couple of places.
1759          */
1760         {
1761                 /* use volatile pointer to prevent code rearrangement */
1762                 volatile XLogCtlData *xlogctl = XLogCtl;
1763
1764                 SpinLockAcquire(&xlogctl->info_lck);
1765                 xlogctl->LogwrtResult = LogwrtResult;
1766                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1767                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1768                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1769                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1770                 SpinLockRelease(&xlogctl->info_lck);
1771         }
1772
1773         Write->LogwrtResult = LogwrtResult;
1774 }
1775
1776 /*
1777  * Record the LSN for an asynchronous transaction commit.
1778  * (This should not be called for aborts, nor for synchronous commits.)
1779  */
1780 void
1781 XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
1782 {
1783         /* use volatile pointer to prevent code rearrangement */
1784         volatile XLogCtlData *xlogctl = XLogCtl;
1785
1786         SpinLockAcquire(&xlogctl->info_lck);
1787         if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
1788                 xlogctl->asyncCommitLSN = asyncCommitLSN;
1789         SpinLockRelease(&xlogctl->info_lck);
1790 }
1791
1792 /*
1793  * Advance minRecoveryPoint in control file.
1794  *
1795  * If we crash during recovery, we must reach this point again before the
1796  * database is consistent.
1797  *
1798  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1799  * is only updated if it's not already greater than or equal to 'lsn'.
1800  */
1801 static void
1802 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1803 {
1804         /* Quick check using our local copy of the variable */
1805         if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
1806                 return;
1807
1808         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1809
1810         /* update local copy */
1811         minRecoveryPoint = ControlFile->minRecoveryPoint;
1812
1813         /*
1814          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1815          * i.e., we're doing crash recovery.  We never modify the control file's
1816          * value in that case, so we can short-circuit future checks here too.
1817          */
1818         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
1819                 updateMinRecoveryPoint = false;
1820         else if (force || XLByteLT(minRecoveryPoint, lsn))
1821         {
1822                 /* use volatile pointer to prevent code rearrangement */
1823                 volatile XLogCtlData *xlogctl = XLogCtl;
1824                 XLogRecPtr      newMinRecoveryPoint;
1825
1826                 /*
1827                  * To avoid having to update the control file too often, we update it
1828                  * all the way to the last record being replayed, even though 'lsn'
1829                  * would suffice for correctness.  This also allows the 'force' case
1830                  * to not need a valid 'lsn' value.
1831                  *
1832                  * Another important reason for doing it this way is that the passed
1833                  * 'lsn' value could be bogus, i.e., past the end of available WAL,
1834                  * if the caller got it from a corrupted heap page.  Accepting such
1835                  * a value as the min recovery point would prevent us from coming up
1836                  * at all.  Instead, we just log a warning and continue with recovery.
1837                  * (See also the comments about corrupt LSNs in XLogFlush.)
1838                  */
1839                 SpinLockAcquire(&xlogctl->info_lck);
1840                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1841                 SpinLockRelease(&xlogctl->info_lck);
1842
1843                 if (!force && XLByteLT(newMinRecoveryPoint, lsn))
1844                         elog(WARNING,
1845                                  "xlog min recovery request %X/%X is past current point %X/%X",
1846                                  lsn.xlogid, lsn.xrecoff,
1847                                  newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
1848
1849                 /* update control file */
1850                 if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
1851                 {
1852                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
1853                         UpdateControlFile();
1854                         minRecoveryPoint = newMinRecoveryPoint;
1855
1856                         ereport(DEBUG2,
1857                                         (errmsg("updated min recovery point to %X/%X",
1858                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
1859                 }
1860         }
1861         LWLockRelease(ControlFileLock);
1862 }
1863
1864 /*
1865  * Ensure that all XLOG data through the given position is flushed to disk.
1866  *
1867  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1868  * already held, and we try to avoid acquiring it if possible.
1869  */
1870 void
1871 XLogFlush(XLogRecPtr record)
1872 {
1873         XLogRecPtr      WriteRqstPtr;
1874         XLogwrtRqst WriteRqst;
1875
1876         /*
1877          * During REDO, we are reading not writing WAL.  Therefore, instead of
1878          * trying to flush the WAL, we should update minRecoveryPoint instead.
1879          * We test XLogInsertAllowed(), not InRecovery, because we need the
1880          * bgwriter to act this way too, and because when the bgwriter tries
1881          * to write the end-of-recovery checkpoint, it should indeed flush.
1882          */
1883         if (!XLogInsertAllowed())
1884         {
1885                 UpdateMinRecoveryPoint(record, false);
1886                 return;
1887         }
1888
1889         /* Quick exit if already known flushed */
1890         if (XLByteLE(record, LogwrtResult.Flush))
1891                 return;
1892
1893 #ifdef WAL_DEBUG
1894         if (XLOG_DEBUG)
1895                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1896                          record.xlogid, record.xrecoff,
1897                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1898                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1899 #endif
1900
1901         START_CRIT_SECTION();
1902
1903         /*
1904          * Since fsync is usually a horribly expensive operation, we try to
1905          * piggyback as much data as we can on each fsync: if we see any more data
1906          * entered into the xlog buffer, we'll write and fsync that too, so that
1907          * the final value of LogwrtResult.Flush is as large as possible. This
1908          * gives us some chance of avoiding another fsync immediately after.
1909          */
1910
1911         /* initialize to given target; may increase below */
1912         WriteRqstPtr = record;
1913
1914         /* read LogwrtResult and update local state */
1915         {
1916                 /* use volatile pointer to prevent code rearrangement */
1917                 volatile XLogCtlData *xlogctl = XLogCtl;
1918
1919                 SpinLockAcquire(&xlogctl->info_lck);
1920                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
1921                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1922                 LogwrtResult = xlogctl->LogwrtResult;
1923                 SpinLockRelease(&xlogctl->info_lck);
1924         }
1925
1926         /* done already? */
1927         if (!XLByteLE(record, LogwrtResult.Flush))
1928         {
1929                 /* now wait for the write lock */
1930                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1931                 LogwrtResult = XLogCtl->Write.LogwrtResult;
1932                 if (!XLByteLE(record, LogwrtResult.Flush))
1933                 {
1934                         /* try to write/flush later additions to XLOG as well */
1935                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1936                         {
1937                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
1938                                 uint32          freespace = INSERT_FREESPACE(Insert);
1939
1940                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
1941                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1942                                 else
1943                                 {
1944                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1945                                         WriteRqstPtr.xrecoff -= freespace;
1946                                 }
1947                                 LWLockRelease(WALInsertLock);
1948                                 WriteRqst.Write = WriteRqstPtr;
1949                                 WriteRqst.Flush = WriteRqstPtr;
1950                         }
1951                         else
1952                         {
1953                                 WriteRqst.Write = WriteRqstPtr;
1954                                 WriteRqst.Flush = record;
1955                         }
1956                         XLogWrite(WriteRqst, false, false);
1957                 }
1958                 LWLockRelease(WALWriteLock);
1959         }
1960
1961         END_CRIT_SECTION();
1962
1963         /*
1964          * If we still haven't flushed to the request point then we have a
1965          * problem; most likely, the requested flush point is past end of XLOG.
1966          * This has been seen to occur when a disk page has a corrupted LSN.
1967          *
1968          * Formerly we treated this as a PANIC condition, but that hurts the
1969          * system's robustness rather than helping it: we do not want to take down
1970          * the whole system due to corruption on one data page.  In particular, if
1971          * the bad page is encountered again during recovery then we would be
1972          * unable to restart the database at all!  (This scenario actually
1973          * happened in the field several times with 7.1 releases.)  As of 8.4,
1974          * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
1975          * problem; the only time we can reach here during recovery is while
1976          * flushing the end-of-recovery checkpoint record, and we don't expect
1977          * that to have a bad LSN.
1978          *
1979          * Note that for calls from xact.c, the ERROR will
1980          * be promoted to PANIC since xact.c calls this routine inside a critical
1981          * section.  However, calls from bufmgr.c are not within critical sections
1982          * and so we will not force a restart for a bad LSN on a data page.
1983          */
1984         if (XLByteLT(LogwrtResult.Flush, record))
1985                 elog(ERROR,
1986                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
1987                          record.xlogid, record.xrecoff,
1988                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1989 }
1990
1991 /*
1992  * Flush xlog, but without specifying exactly where to flush to.
1993  *
1994  * We normally flush only completed blocks; but if there is nothing to do on
1995  * that basis, we check for unflushed async commits in the current incomplete
1996  * block, and flush through the latest one of those.  Thus, if async commits
1997  * are not being used, we will flush complete blocks only.      We can guarantee
1998  * that async commits reach disk after at most three cycles; normally only
1999  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
2000  * at the end of the buffer ring; this makes a difference only with very high
2001  * load or long wal_writer_delay, but imposes one extra cycle for the worst
2002  * case for async commits.)
2003  *
2004  * This routine is invoked periodically by the background walwriter process.
2005  */
2006 void
2007 XLogBackgroundFlush(void)
2008 {
2009         XLogRecPtr      WriteRqstPtr;
2010         bool            flexible = true;
2011
2012         /* XLOG doesn't need flushing during recovery */
2013         if (RecoveryInProgress())
2014                 return;
2015
2016         /* read LogwrtResult and update local state */
2017         {
2018                 /* use volatile pointer to prevent code rearrangement */
2019                 volatile XLogCtlData *xlogctl = XLogCtl;
2020
2021                 SpinLockAcquire(&xlogctl->info_lck);
2022                 LogwrtResult = xlogctl->LogwrtResult;
2023                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2024                 SpinLockRelease(&xlogctl->info_lck);
2025         }
2026
2027         /* back off to last completed page boundary */
2028         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
2029
2030         /* if we have already flushed that far, consider async commit records */
2031         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2032         {
2033                 /* use volatile pointer to prevent code rearrangement */
2034                 volatile XLogCtlData *xlogctl = XLogCtl;
2035
2036                 SpinLockAcquire(&xlogctl->info_lck);
2037                 WriteRqstPtr = xlogctl->asyncCommitLSN;
2038                 SpinLockRelease(&xlogctl->info_lck);
2039                 flexible = false;               /* ensure it all gets written */
2040         }
2041
2042         /* Done if already known flushed */
2043         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2044                 return;
2045
2046 #ifdef WAL_DEBUG
2047         if (XLOG_DEBUG)
2048                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2049                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
2050                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2051                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2052 #endif
2053
2054         START_CRIT_SECTION();
2055
2056         /* now wait for the write lock */
2057         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2058         LogwrtResult = XLogCtl->Write.LogwrtResult;
2059         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2060         {
2061                 XLogwrtRqst WriteRqst;
2062
2063                 WriteRqst.Write = WriteRqstPtr;
2064                 WriteRqst.Flush = WriteRqstPtr;
2065                 XLogWrite(WriteRqst, flexible, false);
2066         }
2067         LWLockRelease(WALWriteLock);
2068
2069         END_CRIT_SECTION();
2070 }
2071
2072 /*
2073  * Flush any previous asynchronously-committed transactions' commit records.
2074  *
2075  * NOTE: it is unwise to assume that this provides any strong guarantees.
2076  * In particular, because of the inexact LSN bookkeeping used by clog.c,
2077  * we cannot assume that hint bits will be settable for these transactions.
2078  */
2079 void
2080 XLogAsyncCommitFlush(void)
2081 {
2082         XLogRecPtr      WriteRqstPtr;
2083
2084         /* use volatile pointer to prevent code rearrangement */
2085         volatile XLogCtlData *xlogctl = XLogCtl;
2086
2087         /* There's no asynchronously committed transactions during recovery */
2088         if (RecoveryInProgress())
2089                 return;
2090
2091         SpinLockAcquire(&xlogctl->info_lck);
2092         WriteRqstPtr = xlogctl->asyncCommitLSN;
2093         SpinLockRelease(&xlogctl->info_lck);
2094
2095         XLogFlush(WriteRqstPtr);
2096 }
2097
2098 /*
2099  * Test whether XLOG data has been flushed up to (at least) the given position.
2100  *
2101  * Returns true if a flush is still needed.  (It may be that someone else
2102  * is already in process of flushing that far, however.)
2103  */
2104 bool
2105 XLogNeedsFlush(XLogRecPtr record)
2106 {
2107         /* XLOG doesn't need flushing during recovery */
2108         if (RecoveryInProgress())
2109                 return false;
2110
2111         /* Quick exit if already known flushed */
2112         if (XLByteLE(record, LogwrtResult.Flush))
2113                 return false;
2114
2115         /* read LogwrtResult and update local state */
2116         {
2117                 /* use volatile pointer to prevent code rearrangement */
2118                 volatile XLogCtlData *xlogctl = XLogCtl;
2119
2120                 SpinLockAcquire(&xlogctl->info_lck);
2121                 LogwrtResult = xlogctl->LogwrtResult;
2122                 SpinLockRelease(&xlogctl->info_lck);
2123         }
2124
2125         /* check again */
2126         if (XLByteLE(record, LogwrtResult.Flush))
2127                 return false;
2128
2129         return true;
2130 }
2131
2132 /*
2133  * Create a new XLOG file segment, or open a pre-existing one.
2134  *
2135  * log, seg: identify segment to be created/opened.
2136  *
2137  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2138  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2139  * file was used.
2140  *
2141  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2142  * place.  This should be TRUE except during bootstrap log creation.  The
2143  * caller must *not* hold the lock at call.
2144  *
2145  * Returns FD of opened file.
2146  *
2147  * Note: errors here are ERROR not PANIC because we might or might not be
2148  * inside a critical section (eg, during checkpoint there is no reason to
2149  * take down the system on failure).  They will promote to PANIC if we are
2150  * in a critical section.
2151  */
2152 static int
2153 XLogFileInit(uint32 log, uint32 seg,
2154                          bool *use_existent, bool use_lock)
2155 {
2156         char            path[MAXPGPATH];
2157         char            tmppath[MAXPGPATH];
2158         char       *zbuffer;
2159         uint32          installed_log;
2160         uint32          installed_seg;
2161         int                     max_advance;
2162         int                     fd;
2163         int                     nbytes;
2164
2165         XLogFilePath(path, ThisTimeLineID, log, seg);
2166
2167         /*
2168          * Try to use existent file (checkpoint maker may have created it already)
2169          */
2170         if (*use_existent)
2171         {
2172                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2173                                                    S_IRUSR | S_IWUSR);
2174                 if (fd < 0)
2175                 {
2176                         if (errno != ENOENT)
2177                                 ereport(ERROR,
2178                                                 (errcode_for_file_access(),
2179                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2180                                                                 path, log, seg)));
2181                 }
2182                 else
2183                         return fd;
2184         }
2185
2186         /*
2187          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2188          * another process is doing the same thing.  If so, we will end up
2189          * pre-creating an extra log segment.  That seems OK, and better than
2190          * holding the lock throughout this lengthy process.
2191          */
2192         elog(DEBUG2, "creating and filling new WAL file");
2193
2194         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2195
2196         unlink(tmppath);
2197
2198         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2199         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2200                                            S_IRUSR | S_IWUSR);
2201         if (fd < 0)
2202                 ereport(ERROR,
2203                                 (errcode_for_file_access(),
2204                                  errmsg("could not create file \"%s\": %m", tmppath)));
2205
2206         /*
2207          * Zero-fill the file.  We have to do this the hard way to ensure that all
2208          * the file space has really been allocated --- on platforms that allow
2209          * "holes" in files, just seeking to the end doesn't allocate intermediate
2210          * space.  This way, we know that we have all the space and (after the
2211          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2212          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2213          * log file.
2214          *
2215          * Note: palloc zbuffer, instead of just using a local char array, to
2216          * ensure it is reasonably well-aligned; this may save a few cycles
2217          * transferring data to the kernel.
2218          */
2219         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2220         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2221         {
2222                 errno = 0;
2223                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2224                 {
2225                         int                     save_errno = errno;
2226
2227                         /*
2228                          * If we fail to make the file, delete it to release disk space
2229                          */
2230                         unlink(tmppath);
2231                         /* if write didn't set errno, assume problem is no disk space */
2232                         errno = save_errno ? save_errno : ENOSPC;
2233
2234                         ereport(ERROR,
2235                                         (errcode_for_file_access(),
2236                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2237                 }
2238         }
2239         pfree(zbuffer);
2240
2241         if (pg_fsync(fd) != 0)
2242                 ereport(ERROR,
2243                                 (errcode_for_file_access(),
2244                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2245
2246         if (close(fd))
2247                 ereport(ERROR,
2248                                 (errcode_for_file_access(),
2249                                  errmsg("could not close file \"%s\": %m", tmppath)));
2250
2251         /*
2252          * Now move the segment into place with its final name.
2253          *
2254          * If caller didn't want to use a pre-existing file, get rid of any
2255          * pre-existing file.  Otherwise, cope with possibility that someone else
2256          * has created the file while we were filling ours: if so, use ours to
2257          * pre-create a future log segment.
2258          */
2259         installed_log = log;
2260         installed_seg = seg;
2261         max_advance = XLOGfileslop;
2262         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2263                                                                 *use_existent, &max_advance,
2264                                                                 use_lock))
2265         {
2266                 /* No need for any more future segments... */
2267                 unlink(tmppath);
2268         }
2269
2270         elog(DEBUG2, "done creating and filling new WAL file");
2271
2272         /* Set flag to tell caller there was no existent file */
2273         *use_existent = false;
2274
2275         /* Now open original target segment (might not be file I just made) */
2276         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2277                                            S_IRUSR | S_IWUSR);
2278         if (fd < 0)
2279                 ereport(ERROR,
2280                                 (errcode_for_file_access(),
2281                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2282                                   path, log, seg)));
2283
2284         return fd;
2285 }
2286
2287 /*
2288  * Create a new XLOG file segment by copying a pre-existing one.
2289  *
2290  * log, seg: identify segment to be created.
2291  *
2292  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2293  *              a different timeline)
2294  *
2295  * Currently this is only used during recovery, and so there are no locking
2296  * considerations.      But we should be just as tense as XLogFileInit to avoid
2297  * emplacing a bogus file.
2298  */
2299 static void
2300 XLogFileCopy(uint32 log, uint32 seg,
2301                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2302 {
2303         char            path[MAXPGPATH];
2304         char            tmppath[MAXPGPATH];
2305         char            buffer[XLOG_BLCKSZ];
2306         int                     srcfd;
2307         int                     fd;
2308         int                     nbytes;
2309
2310         /*
2311          * Open the source file
2312          */
2313         XLogFilePath(path, srcTLI, srclog, srcseg);
2314         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2315         if (srcfd < 0)
2316                 ereport(ERROR,
2317                                 (errcode_for_file_access(),
2318                                  errmsg("could not open file \"%s\": %m", path)));
2319
2320         /*
2321          * Copy into a temp file name.
2322          */
2323         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2324
2325         unlink(tmppath);
2326
2327         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2328         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2329                                            S_IRUSR | S_IWUSR);
2330         if (fd < 0)
2331                 ereport(ERROR,
2332                                 (errcode_for_file_access(),
2333                                  errmsg("could not create file \"%s\": %m", tmppath)));
2334
2335         /*
2336          * Do the data copying.
2337          */
2338         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2339         {
2340                 errno = 0;
2341                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2342                 {
2343                         if (errno != 0)
2344                                 ereport(ERROR,
2345                                                 (errcode_for_file_access(),
2346                                                  errmsg("could not read file \"%s\": %m", path)));
2347                         else
2348                                 ereport(ERROR,
2349                                                 (errmsg("not enough data in file \"%s\"", path)));
2350                 }
2351                 errno = 0;
2352                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2353                 {
2354                         int                     save_errno = errno;
2355
2356                         /*
2357                          * If we fail to make the file, delete it to release disk space
2358                          */
2359                         unlink(tmppath);
2360                         /* if write didn't set errno, assume problem is no disk space */
2361                         errno = save_errno ? save_errno : ENOSPC;
2362
2363                         ereport(ERROR,
2364                                         (errcode_for_file_access(),
2365                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2366                 }
2367         }
2368
2369         if (pg_fsync(fd) != 0)
2370                 ereport(ERROR,
2371                                 (errcode_for_file_access(),
2372                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2373
2374         if (close(fd))
2375                 ereport(ERROR,
2376                                 (errcode_for_file_access(),
2377                                  errmsg("could not close file \"%s\": %m", tmppath)));
2378
2379         close(srcfd);
2380
2381         /*
2382          * Now move the segment into place with its final name.
2383          */
2384         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2385                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2386 }
2387
2388 /*
2389  * Install a new XLOG segment file as a current or future log segment.
2390  *
2391  * This is used both to install a newly-created segment (which has a temp
2392  * filename while it's being created) and to recycle an old segment.
2393  *
2394  * *log, *seg: identify segment to install as (or first possible target).
2395  * When find_free is TRUE, these are modified on return to indicate the
2396  * actual installation location or last segment searched.
2397  *
2398  * tmppath: initial name of file to install.  It will be renamed into place.
2399  *
2400  * find_free: if TRUE, install the new segment at the first empty log/seg
2401  * number at or after the passed numbers.  If FALSE, install the new segment
2402  * exactly where specified, deleting any existing segment file there.
2403  *
2404  * *max_advance: maximum number of log/seg slots to advance past the starting
2405  * point.  Fail if no free slot is found in this range.  On return, reduced
2406  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2407  * when find_free is FALSE.)
2408  *
2409  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2410  * place.  This should be TRUE except during bootstrap log creation.  The
2411  * caller must *not* hold the lock at call.
2412  *
2413  * Returns TRUE if file installed, FALSE if not installed because of
2414  * exceeding max_advance limit.  On Windows, we also return FALSE if we
2415  * can't rename the file into place because someone's got it open.
2416  * (Any other kind of failure causes ereport().)
2417  */
2418 static bool
2419 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2420                                            bool find_free, int *max_advance,
2421                                            bool use_lock)
2422 {
2423         char            path[MAXPGPATH];
2424         struct stat stat_buf;
2425
2426         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2427
2428         /*
2429          * We want to be sure that only one process does this at a time.
2430          */
2431         if (use_lock)
2432                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2433
2434         if (!find_free)
2435         {
2436                 /* Force installation: get rid of any pre-existing segment file */
2437                 unlink(path);
2438         }
2439         else
2440         {
2441                 /* Find a free slot to put it in */
2442                 while (stat(path, &stat_buf) == 0)
2443                 {
2444                         if (*max_advance <= 0)
2445                         {
2446                                 /* Failed to find a free slot within specified range */
2447                                 if (use_lock)
2448                                         LWLockRelease(ControlFileLock);
2449                                 return false;
2450                         }
2451                         NextLogSeg(*log, *seg);
2452                         (*max_advance)--;
2453                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2454                 }
2455         }
2456
2457         /*
2458          * Prefer link() to rename() here just to be really sure that we don't
2459          * overwrite an existing logfile.  However, there shouldn't be one, so
2460          * rename() is an acceptable substitute except for the truly paranoid.
2461          */
2462 #if HAVE_WORKING_LINK
2463         if (link(tmppath, path) < 0)
2464                 ereport(ERROR,
2465                                 (errcode_for_file_access(),
2466                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2467                                                 tmppath, path, *log, *seg)));
2468         unlink(tmppath);
2469 #else
2470         if (rename(tmppath, path) < 0)
2471         {
2472 #ifdef WIN32
2473 #if !defined(__CYGWIN__)
2474                 if (GetLastError() == ERROR_ACCESS_DENIED)
2475 #else
2476                 if (errno == EACCES)
2477 #endif
2478                 {
2479                         if (use_lock)
2480                                 LWLockRelease(ControlFileLock);
2481                         return false;
2482                 }
2483 #endif   /* WIN32 */
2484
2485                 ereport(ERROR,
2486                                 (errcode_for_file_access(),
2487                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2488                                                 tmppath, path, *log, *seg)));
2489         }
2490 #endif
2491
2492         if (use_lock)
2493                 LWLockRelease(ControlFileLock);
2494
2495         return true;
2496 }
2497
2498 /*
2499  * Open a pre-existing logfile segment for writing.
2500  */
2501 static int
2502 XLogFileOpen(uint32 log, uint32 seg)
2503 {
2504         char            path[MAXPGPATH];
2505         int                     fd;
2506
2507         XLogFilePath(path, ThisTimeLineID, log, seg);
2508
2509         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2510                                            S_IRUSR | S_IWUSR);
2511         if (fd < 0)
2512                 ereport(PANIC,
2513                                 (errcode_for_file_access(),
2514                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2515                                   path, log, seg)));
2516
2517         return fd;
2518 }
2519
2520 /*
2521  * Open a logfile segment for reading (during recovery).
2522  */
2523 static int
2524 XLogFileRead(uint32 log, uint32 seg, int emode)
2525 {
2526         char            path[MAXPGPATH];
2527         char            xlogfname[MAXFNAMELEN];
2528         char            activitymsg[MAXFNAMELEN + 16];
2529         ListCell   *cell;
2530         int                     fd;
2531
2532         /*
2533          * Loop looking for a suitable timeline ID: we might need to read any of
2534          * the timelines listed in expectedTLIs.
2535          *
2536          * We expect curFileTLI on entry to be the TLI of the preceding file in
2537          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2538          * to go backwards; this prevents us from picking up the wrong file when a
2539          * parent timeline extends to higher segment numbers than the child we
2540          * want to read.
2541          */
2542         foreach(cell, expectedTLIs)
2543         {
2544                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2545
2546                 if (tli < curFileTLI)
2547                         break;                          /* don't bother looking at too-old TLIs */
2548
2549                 XLogFileName(xlogfname, tli, log, seg);
2550
2551                 if (InArchiveRecovery)
2552                 {
2553                         /* Report recovery progress in PS display */
2554                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2555                                          xlogfname);
2556                         set_ps_display(activitymsg, false);
2557
2558                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2559                                                                                                           "RECOVERYXLOG",
2560                                                                                                           XLogSegSize);
2561                 }
2562                 else
2563                         XLogFilePath(path, tli, log, seg);
2564
2565                 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2566                 if (fd >= 0)
2567                 {
2568                         /* Success! */
2569                         curFileTLI = tli;
2570
2571                         /* Report recovery progress in PS display */
2572                         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2573                                          xlogfname);
2574                         set_ps_display(activitymsg, false);
2575
2576                         return fd;
2577                 }
2578                 if (errno != ENOENT)    /* unexpected failure? */
2579                         ereport(PANIC,
2580                                         (errcode_for_file_access(),
2581                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2582                                    path, log, seg)));
2583         }
2584
2585         /* Couldn't find it.  For simplicity, complain about front timeline */
2586         XLogFilePath(path, recoveryTargetTLI, log, seg);
2587         errno = ENOENT;
2588         ereport(emode,
2589                         (errcode_for_file_access(),
2590                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2591                                   path, log, seg)));
2592         return -1;
2593 }
2594
2595 /*
2596  * Close the current logfile segment for writing.
2597  */
2598 static void
2599 XLogFileClose(void)
2600 {
2601         Assert(openLogFile >= 0);
2602
2603         /*
2604          * WAL segment files will not be re-read in normal operation, so we advise
2605          * the OS to release any cached pages.  But do not do so if WAL archiving
2606          * is active, because archiver process could use the cache to read the WAL
2607          * segment.  Also, don't bother with it if we are using O_DIRECT, since
2608          * the kernel is presumably not caching in that case.
2609          */
2610 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2611         if (!XLogArchivingActive() &&
2612                 (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
2613                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2614 #endif
2615
2616         if (close(openLogFile))
2617                 ereport(PANIC,
2618                                 (errcode_for_file_access(),
2619                                  errmsg("could not close log file %u, segment %u: %m",
2620                                                 openLogId, openLogSeg)));
2621         openLogFile = -1;
2622 }
2623
2624 /*
2625  * Attempt to retrieve the specified file from off-line archival storage.
2626  * If successful, fill "path" with its complete path (note that this will be
2627  * a temp file name that doesn't follow the normal naming convention), and
2628  * return TRUE.
2629  *
2630  * If not successful, fill "path" with the name of the normal on-line file
2631  * (which may or may not actually exist, but we'll try to use it), and return
2632  * FALSE.
2633  *
2634  * For fixed-size files, the caller may pass the expected size as an
2635  * additional crosscheck on successful recovery.  If the file size is not
2636  * known, set expectedSize = 0.
2637  */
2638 static bool
2639 RestoreArchivedFile(char *path, const char *xlogfname,
2640                                         const char *recovername, off_t expectedSize)
2641 {
2642         char            xlogpath[MAXPGPATH];
2643         char            xlogRestoreCmd[MAXPGPATH];
2644         char            lastRestartPointFname[MAXPGPATH];
2645         char       *dp;
2646         char       *endp;
2647         const char *sp;
2648         int                     rc;
2649         bool            signaled;
2650         struct stat stat_buf;
2651         uint32          restartLog;
2652         uint32          restartSeg;
2653
2654         /*
2655          * When doing archive recovery, we always prefer an archived log file even
2656          * if a file of the same name exists in XLOGDIR.  The reason is that the
2657          * file in XLOGDIR could be an old, un-filled or partly-filled version
2658          * that was copied and restored as part of backing up $PGDATA.
2659          *
2660          * We could try to optimize this slightly by checking the local copy
2661          * lastchange timestamp against the archived copy, but we have no API to
2662          * do this, nor can we guarantee that the lastchange timestamp was
2663          * preserved correctly when we copied to archive. Our aim is robustness,
2664          * so we elect not to do this.
2665          *
2666          * If we cannot obtain the log file from the archive, however, we will try
2667          * to use the XLOGDIR file if it exists.  This is so that we can make use
2668          * of log segments that weren't yet transferred to the archive.
2669          *
2670          * Notice that we don't actually overwrite any files when we copy back
2671          * from archive because the recoveryRestoreCommand may inadvertently
2672          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2673          * fallback to the segments remaining in current XLOGDIR later. The
2674          * copy-from-archive filename is always the same, ensuring that we don't
2675          * run out of disk space on long recoveries.
2676          */
2677         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2678
2679         /*
2680          * Make sure there is no existing file named recovername.
2681          */
2682         if (stat(xlogpath, &stat_buf) != 0)
2683         {
2684                 if (errno != ENOENT)
2685                         ereport(FATAL,
2686                                         (errcode_for_file_access(),
2687                                          errmsg("could not stat file \"%s\": %m",
2688                                                         xlogpath)));
2689         }
2690         else
2691         {
2692                 if (unlink(xlogpath) != 0)
2693                         ereport(FATAL,
2694                                         (errcode_for_file_access(),
2695                                          errmsg("could not remove file \"%s\": %m",
2696                                                         xlogpath)));
2697         }
2698
2699         /*
2700          * Calculate the archive file cutoff point for use during log shipping
2701          * replication. All files earlier than this point can be deleted from the
2702          * archive, though there is no requirement to do so.
2703          *
2704          * We initialise this with the filename of an InvalidXLogRecPtr, which
2705          * will prevent the deletion of any WAL files from the archive because of
2706          * the alphabetic sorting property of WAL filenames.
2707          *
2708          * Once we have successfully located the redo pointer of the checkpoint
2709          * from which we start recovery we never request a file prior to the redo
2710          * pointer of the last restartpoint. When redo begins we know that we have
2711          * successfully located it, so there is no need for additional status
2712          * flags to signify the point when we can begin deleting WAL files from
2713          * the archive.
2714          */
2715         if (InRedo)
2716         {
2717                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2718                                         restartLog, restartSeg);
2719                 XLogFileName(lastRestartPointFname,
2720                                          ControlFile->checkPointCopy.ThisTimeLineID,
2721                                          restartLog, restartSeg);
2722                 /* we shouldn't need anything earlier than last restart point */
2723                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2724         }
2725         else
2726                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2727
2728         /*
2729          * construct the command to be executed
2730          */
2731         dp = xlogRestoreCmd;
2732         endp = xlogRestoreCmd + MAXPGPATH - 1;
2733         *endp = '\0';
2734
2735         for (sp = recoveryRestoreCommand; *sp; sp++)
2736         {
2737                 if (*sp == '%')
2738                 {
2739                         switch (sp[1])
2740                         {
2741                                 case 'p':
2742                                         /* %p: relative path of target file */
2743                                         sp++;
2744                                         StrNCpy(dp, xlogpath, endp - dp);
2745                                         make_native_path(dp);
2746                                         dp += strlen(dp);
2747                                         break;
2748                                 case 'f':
2749                                         /* %f: filename of desired file */
2750                                         sp++;
2751                                         StrNCpy(dp, xlogfname, endp - dp);
2752                                         dp += strlen(dp);
2753                                         break;
2754                                 case 'r':
2755                                         /* %r: filename of last restartpoint */
2756                                         sp++;
2757                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2758                                         dp += strlen(dp);
2759                                         break;
2760                                 case '%':
2761                                         /* convert %% to a single % */
2762                                         sp++;
2763                                         if (dp < endp)
2764                                                 *dp++ = *sp;
2765                                         break;
2766                                 default:
2767                                         /* otherwise treat the % as not special */
2768                                         if (dp < endp)
2769                                                 *dp++ = *sp;
2770                                         break;
2771                         }
2772                 }
2773                 else
2774                 {
2775                         if (dp < endp)
2776                                 *dp++ = *sp;
2777                 }
2778         }
2779         *dp = '\0';
2780
2781         ereport(DEBUG3,
2782                         (errmsg_internal("executing restore command \"%s\"",
2783                                                          xlogRestoreCmd)));
2784
2785         /*
2786          * Set in_restore_command to tell the signal handler that we should exit
2787          * right away on SIGTERM. We know that we're at a safe point to do that.
2788          * Check if we had already received the signal, so that we don't miss a
2789          * shutdown request received just before this.
2790          */
2791         in_restore_command = true;
2792         if (shutdown_requested)
2793                 proc_exit(1);
2794
2795         /*
2796          * Copy xlog from archival storage to XLOGDIR
2797          */
2798         rc = system(xlogRestoreCmd);
2799
2800         in_restore_command = false;
2801
2802         if (rc == 0)
2803         {
2804                 /*
2805                  * command apparently succeeded, but let's make sure the file is
2806                  * really there now and has the correct size.
2807                  *
2808                  * XXX I made wrong-size a fatal error to ensure the DBA would notice
2809                  * it, but is that too strong?  We could try to plow ahead with a
2810                  * local copy of the file ... but the problem is that there probably
2811                  * isn't one, and we'd incorrectly conclude we've reached the end of
2812                  * WAL and we're done recovering ...
2813                  */
2814                 if (stat(xlogpath, &stat_buf) == 0)
2815                 {
2816                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2817                                 ereport(FATAL,
2818                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
2819                                                                 xlogfname,
2820                                                                 (unsigned long) stat_buf.st_size,
2821                                                                 (unsigned long) expectedSize)));
2822                         else
2823                         {
2824                                 ereport(LOG,
2825                                                 (errmsg("restored log file \"%s\" from archive",
2826                                                                 xlogfname)));
2827                                 strcpy(path, xlogpath);
2828                                 return true;
2829                         }
2830                 }
2831                 else
2832                 {
2833                         /* stat failed */
2834                         if (errno != ENOENT)
2835                                 ereport(FATAL,
2836                                                 (errcode_for_file_access(),
2837                                                  errmsg("could not stat file \"%s\": %m",
2838                                                                 xlogpath)));
2839                 }
2840         }
2841
2842         /*
2843          * Remember, we rollforward UNTIL the restore fails so failure here is
2844          * just part of the process... that makes it difficult to determine
2845          * whether the restore failed because there isn't an archive to restore,
2846          * or because the administrator has specified the restore program
2847          * incorrectly.  We have to assume the former.
2848          *
2849          * However, if the failure was due to any sort of signal, it's best to
2850          * punt and abort recovery.  (If we "return false" here, upper levels will
2851          * assume that recovery is complete and start up the database!) It's
2852          * essential to abort on child SIGINT and SIGQUIT, because per spec
2853          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
2854          * those it's a good bet we should have gotten it too.
2855          *
2856          * On SIGTERM, assume we have received a fast shutdown request, and exit
2857          * cleanly. It's pure chance whether we receive the SIGTERM first, or the
2858          * child process. If we receive it first, the signal handler will call
2859          * proc_exit, otherwise we do it here. If we or the child process received
2860          * SIGTERM for any other reason than a fast shutdown request, postmaster
2861          * will perform an immediate shutdown when it sees us exiting
2862          * unexpectedly.
2863          *
2864          * Per the Single Unix Spec, shells report exit status > 128 when a called
2865          * command died on a signal.  Also, 126 and 127 are used to report
2866          * problems such as an unfindable command; treat those as fatal errors
2867          * too.
2868          */
2869         if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
2870                 proc_exit(1);
2871
2872         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2873
2874         ereport(signaled ? FATAL : DEBUG2,
2875                 (errmsg("could not restore file \"%s\" from archive: return code %d",
2876                                 xlogfname, rc)));
2877
2878         /*
2879          * if an archived file is not available, there might still be a version of
2880          * this file in XLOGDIR, so return that as the filename to open.
2881          *
2882          * In many recovery scenarios we expect this to fail also, but if so that
2883          * just means we've reached the end of WAL.
2884          */
2885         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2886         return false;
2887 }
2888
2889 /*
2890  * Attempt to execute the recovery_end_command.
2891  */
2892 static void
2893 ExecuteRecoveryEndCommand(void)
2894 {
2895         char            xlogRecoveryEndCmd[MAXPGPATH];
2896         char            lastRestartPointFname[MAXPGPATH];
2897         char       *dp;
2898         char       *endp;
2899         const char *sp;
2900         int                     rc;
2901         bool            signaled;
2902         uint32          restartLog;
2903         uint32          restartSeg;
2904
2905         Assert(recoveryEndCommand);
2906
2907         /*
2908          * Calculate the archive file cutoff point for use during log shipping
2909          * replication. All files earlier than this point can be deleted from the
2910          * archive, though there is no requirement to do so.
2911          *
2912          * We initialise this with the filename of an InvalidXLogRecPtr, which
2913          * will prevent the deletion of any WAL files from the archive because of
2914          * the alphabetic sorting property of WAL filenames.
2915          *
2916          * Once we have successfully located the redo pointer of the checkpoint
2917          * from which we start recovery we never request a file prior to the redo
2918          * pointer of the last restartpoint. When redo begins we know that we have
2919          * successfully located it, so there is no need for additional status
2920          * flags to signify the point when we can begin deleting WAL files from
2921          * the archive.
2922          */
2923         if (InRedo)
2924         {
2925                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2926                                         restartLog, restartSeg);
2927                 XLogFileName(lastRestartPointFname,
2928                                          ControlFile->checkPointCopy.ThisTimeLineID,
2929                                          restartLog, restartSeg);
2930         }
2931         else
2932                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2933
2934         /*
2935          * construct the command to be executed
2936          */
2937         dp = xlogRecoveryEndCmd;
2938         endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
2939         *endp = '\0';
2940
2941         for (sp = recoveryEndCommand; *sp; sp++)
2942         {
2943                 if (*sp == '%')
2944                 {
2945                         switch (sp[1])
2946                         {
2947                                 case 'r':
2948                                         /* %r: filename of last restartpoint */
2949                                         sp++;
2950                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2951                                         dp += strlen(dp);
2952                                         break;
2953                                 case '%':
2954                                         /* convert %% to a single % */
2955                                         sp++;
2956                                         if (dp < endp)
2957                                                 *dp++ = *sp;
2958                                         break;
2959                                 default:
2960                                         /* otherwise treat the % as not special */
2961                                         if (dp < endp)
2962                                                 *dp++ = *sp;
2963                                         break;
2964                         }
2965                 }
2966                 else
2967                 {
2968                         if (dp < endp)
2969                                 *dp++ = *sp;
2970                 }
2971         }
2972         *dp = '\0';
2973
2974         ereport(DEBUG3,
2975                         (errmsg_internal("executing recovery end command \"%s\"",
2976                                                          xlogRecoveryEndCmd)));
2977
2978         /*
2979          * execute the constructed command
2980          */
2981         rc = system(xlogRecoveryEndCmd);
2982         if (rc != 0)
2983         {
2984                 /*
2985                  * If the failure was due to any sort of signal, it's best to punt and
2986                  * abort recovery. See also detailed comments on signals in
2987                  * RestoreArchivedFile().
2988                  */
2989                 signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2990
2991                 ereport(signaled ? FATAL : WARNING,
2992                                 (errmsg("recovery_end_command \"%s\": return code %d",
2993                                                 xlogRecoveryEndCmd, rc)));
2994         }
2995 }
2996
2997 /*
2998  * Preallocate log files beyond the specified log endpoint.
2999  *
3000  * XXX this is currently extremely conservative, since it forces only one
3001  * future log segment to exist, and even that only if we are 75% done with
3002  * the current one.  This is only appropriate for very low-WAL-volume systems.
3003  * High-volume systems will be OK once they've built up a sufficient set of
3004  * recycled log segments, but the startup transient is likely to include
3005  * a lot of segment creations by foreground processes, which is not so good.
3006  */
3007 static void
3008 PreallocXlogFiles(XLogRecPtr endptr)
3009 {
3010         uint32          _logId;
3011         uint32          _logSeg;
3012         int                     lf;
3013         bool            use_existent;
3014
3015         XLByteToPrevSeg(endptr, _logId, _logSeg);
3016         if ((endptr.xrecoff - 1) % XLogSegSize >=
3017                 (uint32) (0.75 * XLogSegSize))
3018         {
3019                 NextLogSeg(_logId, _logSeg);
3020                 use_existent = true;
3021                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
3022                 close(lf);
3023                 if (!use_existent)
3024                         CheckpointStats.ckpt_segs_added++;
3025         }
3026 }
3027
3028 /*
3029  * Recycle or remove all log files older or equal to passed log/seg#
3030  *
3031  * endptr is current (or recent) end of xlog; this is used to determine
3032  * whether we want to recycle rather than delete no-longer-wanted log files.
3033  */
3034 static void
3035 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
3036 {
3037         uint32          endlogId;
3038         uint32          endlogSeg;
3039         int                     max_advance;
3040         DIR                *xldir;
3041         struct dirent *xlde;
3042         char            lastoff[MAXFNAMELEN];
3043         char            path[MAXPGPATH];
3044         struct stat statbuf;
3045
3046         /*
3047          * Initialize info about where to try to recycle to.  We allow recycling
3048          * segments up to XLOGfileslop segments beyond the current XLOG location.
3049          */
3050         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3051         max_advance = XLOGfileslop;
3052
3053         xldir = AllocateDir(XLOGDIR);
3054         if (xldir == NULL)
3055                 ereport(ERROR,
3056                                 (errcode_for_file_access(),
3057                                  errmsg("could not open transaction log directory \"%s\": %m",
3058                                                 XLOGDIR)));
3059
3060         XLogFileName(lastoff, ThisTimeLineID, log, seg);
3061
3062         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3063         {
3064                 /*
3065                  * We ignore the timeline part of the XLOG segment identifiers in
3066                  * deciding whether a segment is still needed.  This ensures that we
3067                  * won't prematurely remove a segment from a parent timeline. We could
3068                  * probably be a little more proactive about removing segments of
3069                  * non-parent timelines, but that would be a whole lot more
3070                  * complicated.
3071                  *
3072                  * We use the alphanumeric sorting property of the filenames to decide
3073                  * which ones are earlier than the lastoff segment.
3074                  */
3075                 if (strlen(xlde->d_name) == 24 &&
3076                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3077                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3078                 {
3079                         if (XLogArchiveCheckDone(xlde->d_name))
3080                         {
3081                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3082
3083                                 /*
3084                                  * Before deleting the file, see if it can be recycled as a
3085                                  * future log segment. Only recycle normal files, pg_standby
3086                                  * for example can create symbolic links pointing to a
3087                                  * separate archive directory.
3088                                  */
3089                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3090                                         InstallXLogFileSegment(&endlogId, &endlogSeg, path,
3091                                                                                    true, &max_advance, true))
3092                                 {
3093                                         ereport(DEBUG2,
3094                                                         (errmsg("recycled transaction log file \"%s\"",
3095                                                                         xlde->d_name)));
3096                                         CheckpointStats.ckpt_segs_recycled++;
3097                                         /* Needn't recheck that slot on future iterations */
3098                                         if (max_advance > 0)
3099                                         {
3100                                                 NextLogSeg(endlogId, endlogSeg);
3101                                                 max_advance--;
3102                                         }
3103                                 }
3104                                 else
3105                                 {
3106                                         /* No need for any more future segments... */
3107                                         ereport(DEBUG2,
3108                                                         (errmsg("removing transaction log file \"%s\"",
3109                                                                         xlde->d_name)));
3110                                         unlink(path);
3111                                         CheckpointStats.ckpt_segs_removed++;
3112                                 }
3113
3114                                 XLogArchiveCleanup(xlde->d_name);
3115                         }
3116                 }
3117         }
3118
3119         FreeDir(xldir);
3120 }
3121
3122 /*
3123  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3124  * If the latter does not exist, recreate it.
3125  *
3126  * It is not the goal of this function to verify the contents of these
3127  * directories, but to help in cases where someone has performed a cluster
3128  * copy for PITR purposes but omitted pg_xlog from the copy.
3129  *
3130  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3131  * policy decision was made not to.  It is fairly common for pg_xlog to be
3132  * a symlink, and if that was the DBA's intent then automatically making a
3133  * plain directory would result in degraded performance with no notice.
3134  */
3135 static void
3136 ValidateXLOGDirectoryStructure(void)
3137 {
3138         char            path[MAXPGPATH];
3139         struct stat stat_buf;
3140
3141         /* Check for pg_xlog; if it doesn't exist, error out */
3142         if (stat(XLOGDIR, &stat_buf) != 0 ||
3143                 !S_ISDIR(stat_buf.st_mode))
3144                 ereport(FATAL,
3145                                 (errmsg("required WAL directory \"%s\" does not exist",
3146                                                 XLOGDIR)));
3147
3148         /* Check for archive_status */
3149         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3150         if (stat(path, &stat_buf) == 0)
3151         {
3152                 /* Check for weird cases where it exists but isn't a directory */
3153                 if (!S_ISDIR(stat_buf.st_mode))
3154                         ereport(FATAL,
3155                                         (errmsg("required WAL directory \"%s\" does not exist",
3156                                                         path)));
3157         }
3158         else
3159         {
3160                 ereport(LOG,
3161                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3162                 if (mkdir(path, 0700) < 0)
3163                         ereport(FATAL,
3164                                         (errmsg("could not create missing directory \"%s\": %m",
3165                                                         path)));
3166         }
3167 }
3168
3169 /*
3170  * Remove previous backup history files.  This also retries creation of
3171  * .ready files for any backup history files for which XLogArchiveNotify
3172  * failed earlier.
3173  */
3174 static void
3175 CleanupBackupHistory(void)
3176 {
3177         DIR                *xldir;
3178         struct dirent *xlde;
3179         char            path[MAXPGPATH];
3180
3181         xldir = AllocateDir(XLOGDIR);
3182         if (xldir == NULL)
3183                 ereport(ERROR,
3184                                 (errcode_for_file_access(),
3185                                  errmsg("could not open transaction log directory \"%s\": %m",
3186                                                 XLOGDIR)));
3187
3188         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3189         {
3190                 if (strlen(xlde->d_name) > 24 &&
3191                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3192                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3193                                    ".backup") == 0)
3194                 {
3195                         if (XLogArchiveCheckDone(xlde->d_name))
3196                         {
3197                                 ereport(DEBUG2,
3198                                 (errmsg("removing transaction log backup history file \"%s\"",
3199                                                 xlde->d_name)));
3200                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3201                                 unlink(path);
3202                                 XLogArchiveCleanup(xlde->d_name);
3203                         }
3204                 }
3205         }
3206
3207         FreeDir(xldir);
3208 }
3209
3210 /*
3211  * Restore the backup blocks present in an XLOG record, if any.
3212  *
3213  * We assume all of the record has been read into memory at *record.
3214  *
3215  * Note: when a backup block is available in XLOG, we restore it
3216  * unconditionally, even if the page in the database appears newer.
3217  * This is to protect ourselves against database pages that were partially
3218  * or incorrectly written during a crash.  We assume that the XLOG data
3219  * must be good because it has passed a CRC check, while the database
3220  * page might not be.  This will force us to replay all subsequent
3221  * modifications of the page that appear in XLOG, rather than possibly
3222  * ignoring them as already applied, but that's not a huge drawback.
3223  *
3224  * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3225  * Otherwise, a normal exclusive lock is used.  At the moment, that's just
3226  * pro forma, because there can't be any regular backends in the system
3227  * during recovery.  The 'cleanup' argument applies to all backup blocks
3228  * in the WAL record, that suffices for now.
3229  */
3230 void
3231 RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3232 {
3233         Buffer          buffer;
3234         Page            page;
3235         BkpBlock        bkpb;
3236         char       *blk;
3237         int                     i;
3238
3239         if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
3240                 return;
3241
3242         blk = (char *) XLogRecGetData(record) + record->xl_len;
3243         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3244         {
3245                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3246                         continue;
3247
3248                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3249                 blk += sizeof(BkpBlock);
3250
3251                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3252                                                                                 RBM_ZERO);
3253                 Assert(BufferIsValid(buffer));
3254                 if (cleanup)
3255                         LockBufferForCleanup(buffer);
3256                 else
3257                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3258
3259                 page = (Page) BufferGetPage(buffer);
3260
3261                 if (bkpb.hole_length == 0)
3262                 {
3263                         memcpy((char *) page, blk, BLCKSZ);
3264                 }
3265                 else
3266                 {
3267                         /* must zero-fill the hole */
3268                         MemSet((char *) page, 0, BLCKSZ);
3269                         memcpy((char *) page, blk, bkpb.hole_offset);
3270                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3271                                    blk + bkpb.hole_offset,
3272                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3273                 }
3274
3275                 PageSetLSN(page, lsn);
3276                 PageSetTLI(page, ThisTimeLineID);
3277                 MarkBufferDirty(buffer);
3278                 UnlockReleaseBuffer(buffer);
3279
3280                 blk += BLCKSZ - bkpb.hole_length;
3281         }
3282 }
3283
3284 /*
3285  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
3286  * record (other than to the minimal extent of computing the amount of
3287  * data to read in) until we've checked the CRCs.
3288  *
3289  * We assume all of the record has been read into memory at *record.
3290  */
3291 static bool
3292 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
3293 {
3294         pg_crc32        crc;
3295         int                     i;
3296         uint32          len = record->xl_len;
3297         BkpBlock        bkpb;
3298         char       *blk;
3299
3300         /* First the rmgr data */
3301         INIT_CRC32(crc);
3302         COMP_CRC32(crc, XLogRecGetData(record), len);
3303
3304         /* Add in the backup blocks, if any */
3305         blk = (char *) XLogRecGetData(record) + len;
3306         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3307         {
3308                 uint32          blen;
3309
3310                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3311                         continue;
3312
3313                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3314                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3315                 {
3316                         ereport(emode,
3317                                         (errmsg("incorrect hole size in record at %X/%X",
3318                                                         recptr.xlogid, recptr.xrecoff)));
3319                         return false;
3320                 }
3321                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3322                 COMP_CRC32(crc, blk, blen);
3323                 blk += blen;
3324         }
3325
3326         /* Check that xl_tot_len agrees with our calculation */
3327         if (blk != (char *) record + record->xl_tot_len)
3328         {
3329                 ereport(emode,
3330                                 (errmsg("incorrect total length in record at %X/%X",
3331                                                 recptr.xlogid, recptr.xrecoff)));
3332                 return false;
3333         }
3334
3335         /* Finally include the record header */
3336         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3337                            SizeOfXLogRecord - sizeof(pg_crc32));
3338         FIN_CRC32(crc);
3339
3340         if (!EQ_CRC32(record->xl_crc, crc))
3341         {
3342                 ereport(emode,
3343                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3344                                 recptr.xlogid, recptr.xrecoff)));
3345                 return false;
3346         }
3347
3348         return true;
3349 }
3350
3351 /*
3352  * Attempt to read an XLOG record.
3353  *
3354  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3355  * try to read a record just after the last one previously read.
3356  *
3357  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3358  * (emode must be either PANIC or LOG.)
3359  *
3360  * The record is copied into readRecordBuf, so that on successful return,
3361  * the returned record pointer always points there.
3362  */
3363 static XLogRecord *
3364 ReadRecord(XLogRecPtr *RecPtr, int emode)
3365 {
3366         XLogRecord *record;
3367         char       *buffer;
3368         XLogRecPtr      tmpRecPtr = EndRecPtr;
3369         bool            randAccess = false;
3370         uint32          len,
3371                                 total_len;
3372         uint32          targetPageOff;
3373         uint32          targetRecOff;
3374         uint32          pageHeaderSize;
3375
3376         if (readBuf == NULL)
3377         {
3378                 /*
3379                  * First time through, permanently allocate readBuf.  We do it this
3380                  * way, rather than just making a static array, for two reasons: (1)
3381                  * no need to waste the storage in most instantiations of the backend;
3382                  * (2) a static char array isn't guaranteed to have any particular
3383                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3384                  */
3385                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3386                 Assert(readBuf != NULL);
3387         }
3388
3389         if (RecPtr == NULL)
3390         {
3391                 RecPtr = &tmpRecPtr;
3392                 /* fast case if next record is on same page */
3393                 if (nextRecord != NULL)
3394                 {
3395                         record = nextRecord;
3396                         goto got_record;
3397                 }
3398                 /* align old recptr to next page */
3399                 if (tmpRecPtr.xrecoff % XLOG_BLCKSZ != 0)
3400                         tmpRecPtr.xrecoff += (XLOG_BLCKSZ - tmpRecPtr.xrecoff % XLOG_BLCKSZ);
3401                 if (tmpRecPtr.xrecoff >= XLogFileSize)
3402                 {
3403                         (tmpRecPtr.xlogid)++;
3404                         tmpRecPtr.xrecoff = 0;
3405                 }
3406                 /* We will account for page header size below */
3407         }
3408         else
3409         {
3410                 if (!XRecOffIsValid(RecPtr->xrecoff))
3411                         ereport(PANIC,
3412                                         (errmsg("invalid record offset at %X/%X",
3413                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3414
3415                 /*
3416                  * Since we are going to a random position in WAL, forget any prior
3417                  * state about what timeline we were in, and allow it to be any
3418                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3419                  * to go backwards (but we can't reset that variable right here, since
3420                  * we might not change files at all).
3421                  */
3422                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3423                 randAccess = true;              /* allow curFileTLI to go backwards too */
3424         }
3425
3426         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
3427         {
3428                 close(readFile);
3429                 readFile = -1;
3430         }
3431         XLByteToSeg(*RecPtr, readId, readSeg);
3432         if (readFile < 0)
3433         {
3434                 /* Now it's okay to reset curFileTLI if random fetch */
3435                 if (randAccess)
3436                         curFileTLI = 0;
3437
3438                 readFile = XLogFileRead(readId, readSeg, emode);
3439                 if (readFile < 0)
3440                         goto next_record_is_invalid;
3441
3442                 /*
3443                  * Whenever switching to a new WAL segment, we read the first page of
3444                  * the file and validate its header, even if that's not where the
3445                  * target record is.  This is so that we can check the additional
3446                  * identification info that is present in the first page's "long"
3447                  * header.
3448                  */
3449                 readOff = 0;
3450                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3451                 {
3452                         ereport(emode,
3453                                         (errcode_for_file_access(),
3454                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3455                                                         readId, readSeg, readOff)));
3456                         goto next_record_is_invalid;
3457                 }
3458                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3459                         goto next_record_is_invalid;
3460         }
3461
3462         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3463         if (readOff != targetPageOff)
3464         {
3465                 readOff = targetPageOff;
3466                 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
3467                 {
3468                         ereport(emode,
3469                                         (errcode_for_file_access(),
3470                                          errmsg("could not seek in log file %u, segment %u to offset %u: %m",
3471                                                         readId, readSeg, readOff)));
3472                         goto next_record_is_invalid;
3473                 }
3474                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3475                 {
3476                         ereport(emode,
3477                                         (errcode_for_file_access(),
3478                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3479                                                         readId, readSeg, readOff)));
3480                         goto next_record_is_invalid;
3481                 }
3482                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3483                         goto next_record_is_invalid;
3484         }
3485         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3486         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3487         if (targetRecOff == 0)
3488         {
3489                 /*
3490                  * Can only get here in the continuing-from-prev-page case, because
3491                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
3492                  * to skip over the new page's header.
3493                  */
3494                 tmpRecPtr.xrecoff += pageHeaderSize;
3495                 targetRecOff = pageHeaderSize;
3496         }
3497         else if (targetRecOff < pageHeaderSize)
3498         {
3499                 ereport(emode,
3500                                 (errmsg("invalid record offset at %X/%X",
3501                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3502                 goto next_record_is_invalid;
3503         }
3504         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3505                 targetRecOff == pageHeaderSize)
3506         {
3507                 ereport(emode,
3508                                 (errmsg("contrecord is requested by %X/%X",
3509                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3510                 goto next_record_is_invalid;
3511         }
3512         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3513
3514 got_record:;
3515
3516         /*
3517          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3518          * required.
3519          */
3520         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3521         {
3522                 if (record->xl_len != 0)
3523                 {
3524                         ereport(emode,
3525                                         (errmsg("invalid xlog switch record at %X/%X",
3526                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3527                         goto next_record_is_invalid;
3528                 }
3529         }
3530         else if (record->xl_len == 0)
3531         {
3532                 ereport(emode,
3533                                 (errmsg("record with zero length at %X/%X",
3534                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3535                 goto next_record_is_invalid;
3536         }
3537         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3538                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3539                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3540         {
3541                 ereport(emode,
3542                                 (errmsg("invalid record length at %X/%X",
3543                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3544                 goto next_record_is_invalid;
3545         }
3546         if (record->xl_rmid > RM_MAX_ID)
3547         {
3548                 ereport(emode,
3549                                 (errmsg("invalid resource manager ID %u at %X/%X",
3550                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3551                 goto next_record_is_invalid;
3552         }
3553         if (randAccess)
3554         {
3555                 /*
3556                  * We can't exactly verify the prev-link, but surely it should be less
3557                  * than the record's own address.
3558                  */
3559                 if (!XLByteLT(record->xl_prev, *RecPtr))
3560                 {
3561                         ereport(emode,
3562                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3563                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3564                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3565                         goto next_record_is_invalid;
3566                 }
3567         }
3568         else
3569         {
3570                 /*
3571                  * Record's prev-link should exactly match our previous location. This
3572                  * check guards against torn WAL pages where a stale but valid-looking
3573                  * WAL record starts on a sector boundary.
3574                  */
3575                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3576                 {
3577                         ereport(emode,
3578                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3579                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3580                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3581                         goto next_record_is_invalid;
3582                 }
3583         }
3584
3585         /*
3586          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3587          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3588          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3589          * enough for all "normal" records, but very large commit or abort records
3590          * might need more space.)
3591          */
3592         total_len = record->xl_tot_len;
3593         if (total_len > readRecordBufSize)
3594         {
3595                 uint32          newSize = total_len;
3596
3597                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3598                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3599                 if (readRecordBuf)
3600                         free(readRecordBuf);
3601                 readRecordBuf = (char *) malloc(newSize);
3602                 if (!readRecordBuf)
3603                 {
3604                         readRecordBufSize = 0;
3605                         /* We treat this as a "bogus data" condition */
3606                         ereport(emode,
3607                                         (errmsg("record length %u at %X/%X too long",
3608                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3609                         goto next_record_is_invalid;
3610                 }
3611                 readRecordBufSize = newSize;
3612         }
3613
3614         buffer = readRecordBuf;
3615         nextRecord = NULL;
3616         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3617         if (total_len > len)
3618         {
3619                 /* Need to reassemble record */
3620                 XLogContRecord *contrecord;
3621                 uint32          gotlen = len;
3622
3623                 memcpy(buffer, record, len);
3624                 record = (XLogRecord *) buffer;
3625                 buffer += len;
3626                 for (;;)
3627                 {
3628                         readOff += XLOG_BLCKSZ;
3629                         if (readOff >= XLogSegSize)
3630                         {
3631                                 close(readFile);
3632                                 readFile = -1;
3633                                 NextLogSeg(readId, readSeg);
3634                                 readFile = XLogFileRead(readId, readSeg, emode);
3635                                 if (readFile < 0)
3636                                         goto next_record_is_invalid;
3637                                 readOff = 0;
3638                         }
3639                         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3640                         {
3641                                 ereport(emode,
3642                                                 (errcode_for_file_access(),
3643                                                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
3644                                                                 readId, readSeg, readOff)));
3645                                 goto next_record_is_invalid;
3646                         }
3647                         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3648                                 goto next_record_is_invalid;
3649                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3650                         {
3651                                 ereport(emode,
3652                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3653                                                                 readId, readSeg, readOff)));
3654                                 goto next_record_is_invalid;
3655                         }
3656                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3657                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
3658                         if (contrecord->xl_rem_len == 0 ||
3659                                 total_len != (contrecord->xl_rem_len + gotlen))
3660                         {
3661                                 ereport(emode,
3662                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
3663                                                                 contrecord->xl_rem_len,
3664                                                                 readId, readSeg, readOff)));
3665                                 goto next_record_is_invalid;
3666                         }
3667                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
3668                         if (contrecord->xl_rem_len > len)
3669                         {
3670                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
3671                                 gotlen += len;
3672                                 buffer += len;
3673                                 continue;
3674                         }
3675                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
3676                                    contrecord->xl_rem_len);
3677                         break;
3678                 }
3679                 if (!RecordIsValid(record, *RecPtr, emode))
3680                         goto next_record_is_invalid;
3681                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3682                 if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
3683                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
3684                 {
3685                         nextRecord = (XLogRecord *) ((char *) contrecord +
3686                                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
3687                 }
3688                 EndRecPtr.xlogid = readId;
3689                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3690                         pageHeaderSize +
3691                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3692                 ReadRecPtr = *RecPtr;
3693                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
3694                 return record;
3695         }
3696
3697         /* Record does not cross a page boundary */
3698         if (!RecordIsValid(record, *RecPtr, emode))
3699                 goto next_record_is_invalid;
3700         if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
3701                 MAXALIGN(total_len))
3702                 nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
3703         EndRecPtr.xlogid = RecPtr->xlogid;
3704         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3705         ReadRecPtr = *RecPtr;
3706         memcpy(buffer, record, total_len);
3707
3708         /*
3709          * Special processing if it's an XLOG SWITCH record
3710          */
3711         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3712         {
3713                 /* Pretend it extends to end of segment */
3714                 EndRecPtr.xrecoff += XLogSegSize - 1;
3715                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
3716                 nextRecord = NULL;              /* definitely not on same page */
3717
3718                 /*
3719                  * Pretend that readBuf contains the last page of the segment. This is
3720                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3721                  * segment.
3722                  */
3723                 readOff = XLogSegSize - XLOG_BLCKSZ;
3724         }
3725         return (XLogRecord *) buffer;
3726
3727 next_record_is_invalid:;
3728         if (readFile >= 0)
3729         {
3730                 close(readFile);
3731                 readFile = -1;
3732         }
3733         nextRecord = NULL;
3734         return NULL;
3735 }
3736
3737 /*
3738  * Check whether the xlog header of a page just read in looks valid.
3739  *
3740  * This is just a convenience subroutine to avoid duplicated code in
3741  * ReadRecord.  It's not intended for use from anywhere else.
3742  */
3743 static bool
3744 ValidXLOGHeader(XLogPageHeader hdr, int emode)
3745 {
3746         XLogRecPtr      recaddr;
3747
3748         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3749         {
3750                 ereport(emode,
3751                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
3752                                                 hdr->xlp_magic, readId, readSeg, readOff)));
3753                 return false;
3754         }
3755         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3756         {
3757                 ereport(emode,
3758                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3759                                                 hdr->xlp_info, readId, readSeg, readOff)));
3760                 return false;
3761         }
3762         if (hdr->xlp_info & XLP_LONG_HEADER)
3763         {
3764                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3765
3766                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3767                 {
3768                         char            fhdrident_str[32];
3769                         char            sysident_str[32];
3770
3771                         /*
3772                          * Format sysids separately to keep platform-dependent format code
3773                          * out of the translatable message string.
3774                          */
3775                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
3776                                          longhdr->xlp_sysid);
3777                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
3778                                          ControlFile->system_identifier);
3779                         ereport(emode,
3780                                         (errmsg("WAL file is from different system"),
3781                                          errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
3782                                                            fhdrident_str, sysident_str)));
3783                         return false;
3784                 }
3785                 if (longhdr->xlp_seg_size != XLogSegSize)
3786                 {
3787                         ereport(emode,
3788                                         (errmsg("WAL file is from different system"),
3789                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3790                         return false;
3791                 }
3792                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
3793                 {
3794                         ereport(emode,
3795                                         (errmsg("WAL file is from different system"),
3796                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
3797                         return false;
3798                 }
3799         }
3800         else if (readOff == 0)
3801         {
3802                 /* hmm, first page of file doesn't have a long header? */
3803                 ereport(emode,
3804                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3805                                                 hdr->xlp_info, readId, readSeg, readOff)));
3806                 return false;
3807         }
3808
3809         recaddr.xlogid = readId;
3810         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
3811         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
3812         {
3813                 ereport(emode,
3814                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
3815                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
3816                                                 readId, readSeg, readOff)));
3817                 return false;
3818         }
3819
3820         /*
3821          * Check page TLI is one of the expected values.
3822          */
3823         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
3824         {
3825                 ereport(emode,
3826                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
3827                                                 hdr->xlp_tli,
3828                                                 readId, readSeg, readOff)));
3829                 return false;
3830         }
3831
3832         /*
3833          * Since child timelines are always assigned a TLI greater than their
3834          * immediate parent's TLI, we should never see TLI go backwards across
3835          * successive pages of a consistent WAL sequence.
3836          *
3837          * Of course this check should only be applied when advancing sequentially
3838          * across pages; therefore ReadRecord resets lastPageTLI to zero when
3839          * going to a random page.
3840          */
3841         if (hdr->xlp_tli < lastPageTLI)
3842         {
3843                 ereport(emode,
3844                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
3845                                                 hdr->xlp_tli, lastPageTLI,
3846                                                 readId, readSeg, readOff)));
3847                 return false;
3848         }
3849         lastPageTLI = hdr->xlp_tli;
3850         return true;
3851 }
3852
3853 /*
3854  * Try to read a timeline's history file.
3855  *
3856  * If successful, return the list of component TLIs (the given TLI followed by
3857  * its ancestor TLIs).  If we can't find the history file, assume that the
3858  * timeline has no parents, and return a list of just the specified timeline
3859  * ID.
3860  */
3861 static List *
3862 readTimeLineHistory(TimeLineID targetTLI)
3863 {
3864         List       *result;
3865         char            path[MAXPGPATH];
3866         char            histfname[MAXFNAMELEN];
3867         char            fline[MAXPGPATH];
3868         FILE       *fd;
3869
3870         if (InArchiveRecovery)
3871         {
3872                 TLHistoryFileName(histfname, targetTLI);
3873                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3874         }
3875         else
3876                 TLHistoryFilePath(path, targetTLI);
3877
3878         fd = AllocateFile(path, "r");
3879         if (fd == NULL)
3880         {
3881                 if (errno != ENOENT)
3882                         ereport(FATAL,
3883                                         (errcode_for_file_access(),
3884                                          errmsg("could not open file \"%s\": %m", path)));
3885                 /* Not there, so assume no parents */
3886                 return list_make1_int((int) targetTLI);
3887         }
3888
3889         result = NIL;
3890
3891         /*
3892          * Parse the file...
3893          */
3894         while (fgets(fline, sizeof(fline), fd) != NULL)
3895         {
3896                 /* skip leading whitespace and check for # comment */
3897                 char       *ptr;
3898                 char       *endptr;
3899                 TimeLineID      tli;
3900
3901                 for (ptr = fline; *ptr; ptr++)
3902                 {
3903                         if (!isspace((unsigned char) *ptr))
3904                                 break;
3905                 }
3906                 if (*ptr == '\0' || *ptr == '#')
3907                         continue;
3908
3909                 /* expect a numeric timeline ID as first field of line */
3910                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
3911                 if (endptr == ptr)
3912                         ereport(FATAL,
3913                                         (errmsg("syntax error in history file: %s", fline),
3914                                          errhint("Expected a numeric timeline ID.")));
3915
3916                 if (result &&
3917                         tli <= (TimeLineID) linitial_int(result))
3918                         ereport(FATAL,
3919                                         (errmsg("invalid data in history file: %s", fline),
3920                                    errhint("Timeline IDs must be in increasing sequence.")));
3921
3922                 /* Build list with newest item first */
3923                 result = lcons_int((int) tli, result);
3924
3925                 /* we ignore the remainder of each line */
3926         }
3927
3928         FreeFile(fd);
3929
3930         if (result &&
3931                 targetTLI <= (TimeLineID) linitial_int(result))
3932                 ereport(FATAL,
3933                                 (errmsg("invalid data in history file \"%s\"", path),
3934                         errhint("Timeline IDs must be less than child timeline's ID.")));
3935
3936         result = lcons_int((int) targetTLI, result);
3937
3938         ereport(DEBUG3,
3939                         (errmsg_internal("history of timeline %u is %s",
3940                                                          targetTLI, nodeToString(result))));
3941
3942         return result;
3943 }
3944
3945 /*
3946  * Probe whether a timeline history file exists for the given timeline ID
3947  */
3948 static bool
3949 existsTimeLineHistory(TimeLineID probeTLI)
3950 {
3951         char            path[MAXPGPATH];
3952         char            histfname[MAXFNAMELEN];
3953         FILE       *fd;
3954
3955         if (InArchiveRecovery)
3956         {
3957                 TLHistoryFileName(histfname, probeTLI);
3958                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3959         }
3960         else
3961                 TLHistoryFilePath(path, probeTLI);
3962
3963         fd = AllocateFile(path, "r");
3964         if (fd != NULL)
3965         {
3966                 FreeFile(fd);
3967                 return true;
3968         }
3969         else
3970         {
3971                 if (errno != ENOENT)
3972                         ereport(FATAL,
3973                                         (errcode_for_file_access(),
3974                                          errmsg("could not open file \"%s\": %m", path)));
3975                 return false;
3976         }
3977 }
3978
3979 /*
3980  * Find the newest existing timeline, assuming that startTLI exists.
3981  *
3982  * Note: while this is somewhat heuristic, it does positively guarantee
3983  * that (result + 1) is not a known timeline, and therefore it should
3984  * be safe to assign that ID to a new timeline.
3985  */
3986 static TimeLineID
3987 findNewestTimeLine(TimeLineID startTLI)
3988 {
3989         TimeLineID      newestTLI;
3990         TimeLineID      probeTLI;
3991
3992         /*
3993          * The algorithm is just to probe for the existence of timeline history
3994          * files.  XXX is it useful to allow gaps in the sequence?
3995          */
3996         newestTLI = startTLI;
3997
3998         for (probeTLI = startTLI + 1;; probeTLI++)
3999         {
4000                 if (existsTimeLineHistory(probeTLI))
4001                 {
4002                         newestTLI = probeTLI;           /* probeTLI exists */
4003                 }
4004                 else
4005                 {
4006                         /* doesn't exist, assume we're done */
4007                         break;
4008                 }
4009         }
4010
4011         return newestTLI;
4012 }
4013
4014 /*
4015  * Create a new timeline history file.
4016  *
4017  *      newTLI: ID of the new timeline
4018  *      parentTLI: ID of its immediate parent
4019  *      endTLI et al: ID of the last used WAL file, for annotation purposes
4020  *
4021  * Currently this is only used during recovery, and so there are no locking
4022  * considerations.      But we should be just as tense as XLogFileInit to avoid
4023  * emplacing a bogus file.
4024  */
4025 static void
4026 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
4027                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4028 {
4029         char            path[MAXPGPATH];
4030         char            tmppath[MAXPGPATH];
4031         char            histfname[MAXFNAMELEN];
4032         char            xlogfname[MAXFNAMELEN];
4033         char            buffer[BLCKSZ];
4034         int                     srcfd;
4035         int                     fd;
4036         int                     nbytes;
4037
4038         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
4039
4040         /*
4041          * Write into a temp file name.
4042          */
4043         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
4044
4045         unlink(tmppath);
4046
4047         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
4048         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
4049                                            S_IRUSR | S_IWUSR);
4050         if (fd < 0)
4051                 ereport(ERROR,
4052                                 (errcode_for_file_access(),
4053                                  errmsg("could not create file \"%s\": %m", tmppath)));
4054
4055         /*
4056          * If a history file exists for the parent, copy it verbatim
4057          */
4058         if (InArchiveRecovery)
4059         {
4060                 TLHistoryFileName(histfname, parentTLI);
4061                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4062         }
4063         else
4064                 TLHistoryFilePath(path, parentTLI);
4065
4066         srcfd = BasicOpenFile(path, O_RDONLY, 0);
4067         if (srcfd < 0)
4068         {
4069                 if (errno != ENOENT)
4070                         ereport(ERROR,
4071                                         (errcode_for_file_access(),
4072                                          errmsg("could not open file \"%s\": %m", path)));
4073                 /* Not there, so assume parent has no parents */
4074         }
4075         else
4076         {
4077                 for (;;)
4078                 {
4079                         errno = 0;
4080                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
4081                         if (nbytes < 0 || errno != 0)
4082                                 ereport(ERROR,
4083                                                 (errcode_for_file_access(),
4084                                                  errmsg("could not read file \"%s\": %m", path)));
4085                         if (nbytes == 0)
4086                                 break;
4087                         errno = 0;
4088                         if ((int) write(fd, buffer, nbytes) != nbytes)
4089                         {
4090                                 int                     save_errno = errno;
4091
4092                                 /*
4093                                  * If we fail to make the file, delete it to release disk
4094                                  * space
4095                                  */
4096                                 unlink(tmppath);
4097
4098                                 /*
4099                                  * if write didn't set errno, assume problem is no disk space
4100                                  */
4101                                 errno = save_errno ? save_errno : ENOSPC;
4102
4103                                 ereport(ERROR,
4104                                                 (errcode_for_file_access(),
4105                                          errmsg("could not write to file \"%s\": %m", tmppath)));
4106                         }
4107                 }
4108                 close(srcfd);
4109         }
4110
4111         /*
4112          * Append one line with the details of this timeline split.
4113          *
4114          * If we did have a parent file, insert an extra newline just in case the
4115          * parent file failed to end with one.
4116          */
4117         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
4118
4119         snprintf(buffer, sizeof(buffer),
4120                          "%s%u\t%s\t%s transaction %u at %s\n",
4121                          (srcfd < 0) ? "" : "\n",
4122                          parentTLI,
4123                          xlogfname,
4124                          recoveryStopAfter ? "after" : "before",
4125                          recoveryStopXid,
4126                          timestamptz_to_str(recoveryStopTime));
4127
4128         nbytes = strlen(buffer);
4129         errno = 0;
4130         if ((int) write(fd, buffer, nbytes) != nbytes)
4131         {
4132                 int                     save_errno = errno;
4133
4134                 /*
4135                  * If we fail to make the file, delete it to release disk space
4136                  */
4137                 unlink(tmppath);
4138                 /* if write didn't set errno, assume problem is no disk space */
4139                 errno = save_errno ? save_errno : ENOSPC;
4140
4141                 ereport(ERROR,
4142                                 (errcode_for_file_access(),
4143                                  errmsg("could not write to file \"%s\": %m", tmppath)));
4144         }
4145
4146         if (pg_fsync(fd) != 0)
4147                 ereport(ERROR,
4148                                 (errcode_for_file_access(),
4149                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
4150
4151         if (close(fd))
4152                 ereport(ERROR,
4153                                 (errcode_for_file_access(),
4154                                  errmsg("could not close file \"%s\": %m", tmppath)));
4155
4156
4157         /*
4158          * Now move the completed history file into place with its final name.
4159          */
4160         TLHistoryFilePath(path, newTLI);
4161
4162         /*
4163          * Prefer link() to rename() here just to be really sure that we don't
4164          * overwrite an existing logfile.  However, there shouldn't be one, so
4165          * rename() is an acceptable substitute except for the truly paranoid.
4166          */
4167 #if HAVE_WORKING_LINK
4168         if (link(tmppath, path) < 0)
4169                 ereport(ERROR,
4170                                 (errcode_for_file_access(),
4171                                  errmsg("could not link file \"%s\" to \"%s\": %m",
4172                                                 tmppath, path)));
4173         unlink(tmppath);
4174 #else
4175         if (rename(tmppath, path) < 0)
4176                 ereport(ERROR,
4177                                 (errcode_for_file_access(),
4178                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4179                                                 tmppath, path)));
4180 #endif
4181
4182         /* The history file can be archived immediately. */
4183         TLHistoryFileName(histfname, newTLI);
4184         XLogArchiveNotify(histfname);
4185 }
4186
4187 /*
4188  * I/O routines for pg_control
4189  *
4190  * *ControlFile is a buffer in shared memory that holds an image of the
4191  * contents of pg_control.      WriteControlFile() initializes pg_control
4192  * given a preloaded buffer, ReadControlFile() loads the buffer from
4193  * the pg_control file (during postmaster or standalone-backend startup),
4194  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4195  *
4196  * For simplicity, WriteControlFile() initializes the fields of pg_control
4197  * that are related to checking backend/database compatibility, and
4198  * ReadControlFile() verifies they are correct.  We could split out the
4199  * I/O and compatibility-check functions, but there seems no need currently.
4200  */
4201 static void
4202 WriteControlFile(void)
4203 {
4204         int                     fd;
4205         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4206
4207         /*
4208          * Initialize version and compatibility-check fields
4209          */
4210         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4211         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4212
4213         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4214         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4215
4216         ControlFile->blcksz = BLCKSZ;
4217         ControlFile->relseg_size = RELSEG_SIZE;
4218         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4219         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4220
4221         ControlFile->nameDataLen = NAMEDATALEN;
4222         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4223
4224         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4225
4226 #ifdef HAVE_INT64_TIMESTAMP
4227         ControlFile->enableIntTimes = true;
4228 #else
4229         ControlFile->enableIntTimes = false;
4230 #endif
4231         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4232         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4233
4234         /* Contents are protected with a CRC */
4235         INIT_CRC32(ControlFile->crc);
4236         COMP_CRC32(ControlFile->crc,
4237                            (char *) ControlFile,
4238                            offsetof(ControlFileData, crc));
4239         FIN_CRC32(ControlFile->crc);
4240
4241         /*
4242          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4243          * excess over sizeof(ControlFileData).  This reduces the odds of
4244          * premature-EOF errors when reading pg_control.  We'll still fail when we
4245          * check the contents of the file, but hopefully with a more specific
4246          * error than "couldn't read pg_control".
4247          */
4248         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4249                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4250
4251         memset(buffer, 0, PG_CONTROL_SIZE);
4252         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4253
4254         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4255                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4256                                            S_IRUSR | S_IWUSR);
4257         if (fd < 0)
4258                 ereport(PANIC,
4259                                 (errcode_for_file_access(),
4260                                  errmsg("could not create control file \"%s\": %m",
4261                                                 XLOG_CONTROL_FILE)));
4262
4263         errno = 0;
4264         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4265         {
4266                 /* if write didn't set errno, assume problem is no disk space */
4267                 if (errno == 0)
4268                         errno = ENOSPC;
4269                 ereport(PANIC,
4270                                 (errcode_for_file_access(),
4271                                  errmsg("could not write to control file: %m")));
4272         }
4273
4274         if (pg_fsync(fd) != 0)
4275                 ereport(PANIC,
4276                                 (errcode_for_file_access(),
4277                                  errmsg("could not fsync control file: %m")));
4278
4279         if (close(fd))
4280                 ereport(PANIC,
4281                                 (errcode_for_file_access(),
4282                                  errmsg("could not close control file: %m")));
4283 }
4284
4285 static void
4286 ReadControlFile(void)
4287 {
4288         pg_crc32        crc;
4289         int                     fd;
4290
4291         /*
4292          * Read data...
4293          */
4294         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4295                                            O_RDWR | PG_BINARY,
4296                                            S_IRUSR | S_IWUSR);
4297         if (fd < 0)
4298                 ereport(PANIC,
4299                                 (errcode_for_file_access(),
4300                                  errmsg("could not open control file \"%s\": %m",
4301                                                 XLOG_CONTROL_FILE)));
4302
4303         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4304                 ereport(PANIC,
4305                                 (errcode_for_file_access(),
4306                                  errmsg("could not read from control file: %m")));
4307
4308         close(fd);
4309
4310         /*
4311          * Check for expected pg_control format version.  If this is wrong, the
4312          * CRC check will likely fail because we'll be checking the wrong number
4313          * of bytes.  Complaining about wrong version will probably be more
4314          * enlightening than complaining about wrong CRC.
4315          */
4316
4317         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4318                 ereport(FATAL,
4319                                 (errmsg("database files are incompatible with server"),
4320                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4321                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4322                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4323                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4324                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4325
4326         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4327                 ereport(FATAL,
4328                                 (errmsg("database files are incompatible with server"),
4329                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4330                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4331                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4332                                  errhint("It looks like you need to initdb.")));
4333
4334         /* Now check the CRC. */
4335         INIT_CRC32(crc);
4336         COMP_CRC32(crc,
4337                            (char *) ControlFile,
4338                            offsetof(ControlFileData, crc));
4339         FIN_CRC32(crc);
4340
4341         if (!EQ_CRC32(crc, ControlFile->crc))
4342                 ereport(FATAL,
4343                                 (errmsg("incorrect checksum in control file")));
4344
4345         /*
4346          * Do compatibility checking immediately.  If the database isn't
4347          * compatible with the backend executable, we want to abort before we can
4348          * possibly do any damage.
4349          */
4350         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4351                 ereport(FATAL,
4352                                 (errmsg("database files are incompatible with server"),
4353                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4354                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4355                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4356                                  errhint("It looks like you need to initdb.")));
4357         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4358                 ereport(FATAL,
4359                                 (errmsg("database files are incompatible with server"),
4360                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4361                                          " but the server was compiled with MAXALIGN %d.",
4362                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4363                                  errhint("It looks like you need to initdb.")));
4364         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4365                 ereport(FATAL,
4366                                 (errmsg("database files are incompatible with server"),
4367                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4368                                  errhint("It looks like you need to initdb.")));
4369         if (ControlFile->blcksz != BLCKSZ)
4370                 ereport(FATAL,
4371                                 (errmsg("database files are incompatible with server"),
4372                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4373                                            " but the server was compiled with BLCKSZ %d.",
4374                                            ControlFile->blcksz, BLCKSZ),
4375                                  errhint("It looks like you need to recompile or initdb.")));
4376         if (ControlFile->relseg_size != RELSEG_SIZE)
4377                 ereport(FATAL,
4378                                 (errmsg("database files are incompatible with server"),
4379                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4380                                   " but the server was compiled with RELSEG_SIZE %d.",
4381                                   ControlFile->relseg_size, RELSEG_SIZE),
4382                                  errhint("It looks like you need to recompile or initdb.")));
4383         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4384                 ereport(FATAL,
4385                                 (errmsg("database files are incompatible with server"),
4386                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4387                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4388                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4389                                  errhint("It looks like you need to recompile or initdb.")));
4390         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4391                 ereport(FATAL,
4392                                 (errmsg("database files are incompatible with server"),
4393                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4394                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4395                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4396                                  errhint("It looks like you need to recompile or initdb.")));
4397         if (ControlFile->nameDataLen != NAMEDATALEN)
4398                 ereport(FATAL,
4399                                 (errmsg("database files are incompatible with server"),
4400                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4401                                   " but the server was compiled with NAMEDATALEN %d.",
4402                                   ControlFile->nameDataLen, NAMEDATALEN),
4403                                  errhint("It looks like you need to recompile or initdb.")));
4404         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4405                 ereport(FATAL,
4406                                 (errmsg("database files are incompatible with server"),
4407                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4408                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4409                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4410                                  errhint("It looks like you need to recompile or initdb.")));
4411         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4412                 ereport(FATAL,
4413                                 (errmsg("database files are incompatible with server"),
4414                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4415                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4416                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4417                                  errhint("It looks like you need to recompile or initdb.")));
4418
4419 #ifdef HAVE_INT64_TIMESTAMP
4420         if (ControlFile->enableIntTimes != true)
4421                 ereport(FATAL,
4422                                 (errmsg("database files are incompatible with server"),
4423                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4424                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4425                                  errhint("It looks like you need to recompile or initdb.")));
4426 #else
4427         if (ControlFile->enableIntTimes != false)
4428                 ereport(FATAL,
4429                                 (errmsg("database files are incompatible with server"),
4430                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4431                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4432                                  errhint("It looks like you need to recompile or initdb.")));
4433 #endif
4434
4435 #ifdef USE_FLOAT4_BYVAL
4436         if (ControlFile->float4ByVal != true)
4437                 ereport(FATAL,
4438                                 (errmsg("database files are incompatible with server"),
4439                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4440                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4441                                  errhint("It looks like you need to recompile or initdb.")));
4442 #else
4443         if (ControlFile->float4ByVal != false)
4444                 ereport(FATAL,
4445                                 (errmsg("database files are incompatible with server"),
4446                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4447                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4448                                  errhint("It looks like you need to recompile or initdb.")));
4449 #endif
4450
4451 #ifdef USE_FLOAT8_BYVAL
4452         if (ControlFile->float8ByVal != true)
4453                 ereport(FATAL,
4454                                 (errmsg("database files are incompatible with server"),
4455                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4456                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4457                                  errhint("It looks like you need to recompile or initdb.")));
4458 #else
4459         if (ControlFile->float8ByVal != false)
4460                 ereport(FATAL,
4461                                 (errmsg("database files are incompatible with server"),
4462                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4463                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4464                                  errhint("It looks like you need to recompile or initdb.")));
4465 #endif
4466 }
4467
4468 void
4469 UpdateControlFile(void)
4470 {
4471         int                     fd;
4472
4473         INIT_CRC32(ControlFile->crc);
4474         COMP_CRC32(ControlFile->crc,
4475                            (char *) ControlFile,
4476                            offsetof(ControlFileData, crc));
4477         FIN_CRC32(ControlFile->crc);
4478
4479         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4480                                            O_RDWR | PG_BINARY,
4481                                            S_IRUSR | S_IWUSR);
4482         if (fd < 0)
4483                 ereport(PANIC,
4484                                 (errcode_for_file_access(),
4485                                  errmsg("could not open control file \"%s\": %m",
4486                                                 XLOG_CONTROL_FILE)));
4487
4488         errno = 0;
4489         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4490         {
4491                 /* if write didn't set errno, assume problem is no disk space */
4492                 if (errno == 0)
4493                         errno = ENOSPC;
4494                 ereport(PANIC,
4495                                 (errcode_for_file_access(),
4496                                  errmsg("could not write to control file: %m")));
4497         }
4498
4499         if (pg_fsync(fd) != 0)
4500                 ereport(PANIC,
4501                                 (errcode_for_file_access(),
4502                                  errmsg("could not fsync control file: %m")));
4503
4504         if (close(fd))
4505                 ereport(PANIC,
4506                                 (errcode_for_file_access(),
4507                                  errmsg("could not close control file: %m")));
4508 }
4509
4510 /*
4511  * Initialization of shared memory for XLOG
4512  */
4513 Size
4514 XLOGShmemSize(void)
4515 {
4516         Size            size;
4517
4518         /* XLogCtl */
4519         size = sizeof(XLogCtlData);
4520         /* xlblocks array */
4521         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4522         /* extra alignment padding for XLOG I/O buffers */
4523         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4524         /* and the buffers themselves */
4525         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4526
4527         /*
4528          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4529          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4530          * routine again below to compute the actual allocation size.
4531          */
4532
4533         return size;
4534 }
4535
4536 void
4537 XLOGShmemInit(void)
4538 {
4539         bool            foundCFile,
4540                                 foundXLog;
4541         char       *allocptr;
4542
4543         ControlFile = (ControlFileData *)
4544                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4545         XLogCtl = (XLogCtlData *)
4546                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4547
4548         if (foundCFile || foundXLog)
4549         {
4550                 /* both should be present or neither */
4551                 Assert(foundCFile && foundXLog);
4552                 return;
4553         }
4554
4555         memset(XLogCtl, 0, sizeof(XLogCtlData));
4556
4557         /*
4558          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4559          * multiple of the alignment for same, so no extra alignment padding is
4560          * needed here.
4561          */
4562         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4563         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4564         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4565         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4566
4567         /*
4568          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4569          */
4570         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4571         XLogCtl->pages = allocptr;
4572         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4573
4574         /*
4575          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4576          * in additional info.)
4577          */
4578         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4579         XLogCtl->SharedRecoveryInProgress = true;
4580         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4581         SpinLockInit(&XLogCtl->info_lck);
4582
4583         /*
4584          * If we are not in bootstrap mode, pg_control should already exist. Read
4585          * and validate it immediately (see comments in ReadControlFile() for the
4586          * reasons why).
4587          */
4588         if (!IsBootstrapProcessingMode())
4589                 ReadControlFile();
4590 }
4591
4592 /*
4593  * This func must be called ONCE on system install.  It creates pg_control
4594  * and the initial XLOG segment.
4595  */
4596 void
4597 BootStrapXLOG(void)
4598 {
4599         CheckPoint      checkPoint;
4600         char       *buffer;
4601         XLogPageHeader page;
4602         XLogLongPageHeader longpage;
4603         XLogRecord *record;
4604         bool            use_existent;
4605         uint64          sysidentifier;
4606         struct timeval tv;
4607         pg_crc32        crc;
4608
4609         /*
4610          * Select a hopefully-unique system identifier code for this installation.
4611          * We use the result of gettimeofday(), including the fractional seconds
4612          * field, as being about as unique as we can easily get.  (Think not to
4613          * use random(), since it hasn't been seeded and there's no portable way
4614          * to seed it other than the system clock value...)  The upper half of the
4615          * uint64 value is just the tv_sec part, while the lower half is the XOR
4616          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4617          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4618          * knowing this encoding can determine the initialization time of the
4619          * installation, which could perhaps be useful sometimes.
4620          */
4621         gettimeofday(&tv, NULL);
4622         sysidentifier = ((uint64) tv.tv_sec) << 32;
4623         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4624
4625         /* First timeline ID is always 1 */
4626         ThisTimeLineID = 1;
4627
4628         /* page buffer must be aligned suitably for O_DIRECT */
4629         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4630         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4631         memset(page, 0, XLOG_BLCKSZ);
4632
4633         /* Set up information for the initial checkpoint record */
4634         checkPoint.redo.xlogid = 0;
4635         checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
4636         checkPoint.ThisTimeLineID = ThisTimeLineID;
4637         checkPoint.nextXidEpoch = 0;
4638         checkPoint.nextXid = FirstNormalTransactionId;
4639         checkPoint.nextOid = FirstBootstrapObjectId;
4640         checkPoint.nextMulti = FirstMultiXactId;
4641         checkPoint.nextMultiOffset = 0;
4642         checkPoint.oldestXid = FirstNormalTransactionId;
4643         checkPoint.oldestXidDB = TemplateDbOid;
4644         checkPoint.time = (pg_time_t) time(NULL);
4645
4646         ShmemVariableCache->nextXid = checkPoint.nextXid;
4647         ShmemVariableCache->nextOid = checkPoint.nextOid;
4648         ShmemVariableCache->oidCount = 0;
4649         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4650         ShmemVariableCache->oldestXid = checkPoint.oldestXid;
4651         ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
4652
4653         /* Set up the XLOG page header */
4654         page->xlp_magic = XLOG_PAGE_MAGIC;
4655         page->xlp_info = XLP_LONG_HEADER;
4656         page->xlp_tli = ThisTimeLineID;
4657         page->xlp_pageaddr.xlogid = 0;
4658         page->xlp_pageaddr.xrecoff = 0;
4659         longpage = (XLogLongPageHeader) page;
4660         longpage->xlp_sysid = sysidentifier;
4661         longpage->xlp_seg_size = XLogSegSize;
4662         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4663
4664         /* Insert the initial checkpoint record */
4665         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4666         record->xl_prev.xlogid = 0;
4667         record->xl_prev.xrecoff = 0;
4668         record->xl_xid = InvalidTransactionId;
4669         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4670         record->xl_len = sizeof(checkPoint);
4671         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4672         record->xl_rmid = RM_XLOG_ID;
4673         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4674
4675         INIT_CRC32(crc);
4676         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4677         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
4678                            SizeOfXLogRecord - sizeof(pg_crc32));
4679         FIN_CRC32(crc);
4680         record->xl_crc = crc;
4681
4682         /* Create first XLOG segment file */
4683         use_existent = false;
4684         openLogFile = XLogFileInit(0, 0, &use_existent, false);
4685
4686         /* Write the first page with the initial record */
4687         errno = 0;
4688         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4689         {
4690                 /* if write didn't set errno, assume problem is no disk space */
4691                 if (errno == 0)
4692                         errno = ENOSPC;
4693                 ereport(PANIC,
4694                                 (errcode_for_file_access(),
4695                           errmsg("could not write bootstrap transaction log file: %m")));
4696         }
4697
4698         if (pg_fsync(openLogFile) != 0)
4699                 ereport(PANIC,
4700                                 (errcode_for_file_access(),
4701                           errmsg("could not fsync bootstrap transaction log file: %m")));
4702
4703         if (close(openLogFile))
4704                 ereport(PANIC,
4705                                 (errcode_for_file_access(),
4706                           errmsg("could not close bootstrap transaction log file: %m")));
4707
4708         openLogFile = -1;
4709
4710         /* Now create pg_control */
4711
4712         memset(ControlFile, 0, sizeof(ControlFileData));
4713         /* Initialize pg_control status fields */
4714         ControlFile->system_identifier = sysidentifier;
4715         ControlFile->state = DB_SHUTDOWNED;
4716         ControlFile->time = checkPoint.time;
4717         ControlFile->checkPoint = checkPoint.redo;
4718         ControlFile->checkPointCopy = checkPoint;
4719         /* some additional ControlFile fields are set in WriteControlFile() */
4720
4721         WriteControlFile();
4722
4723         /* Bootstrap the commit log, too */
4724         BootStrapCLOG();
4725         BootStrapSUBTRANS();
4726         BootStrapMultiXact();
4727
4728         pfree(buffer);
4729 }
4730
4731 static char *
4732 str_time(pg_time_t tnow)
4733 {
4734         static char buf[128];
4735
4736         pg_strftime(buf, sizeof(buf),
4737                                 "%Y-%m-%d %H:%M:%S %Z",
4738                                 pg_localtime(&tnow, log_timezone));
4739
4740         return buf;
4741 }
4742
4743 /*
4744  * See if there is a recovery command file (recovery.conf), and if so
4745  * read in parameters for archive recovery.
4746  *
4747  * XXX longer term intention is to expand this to
4748  * cater for additional parameters and controls
4749  * possibly use a flex lexer similar to the GUC one
4750  */
4751 static void
4752 readRecoveryCommandFile(void)
4753 {
4754         FILE       *fd;
4755         char            cmdline[MAXPGPATH];
4756         TimeLineID      rtli = 0;
4757         bool            rtliGiven = false;
4758         bool            syntaxError = false;
4759
4760         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4761         if (fd == NULL)
4762         {
4763                 if (errno == ENOENT)
4764                         return;                         /* not there, so no archive recovery */
4765                 ereport(FATAL,
4766                                 (errcode_for_file_access(),
4767                                  errmsg("could not open recovery command file \"%s\": %m",
4768                                                 RECOVERY_COMMAND_FILE)));
4769         }
4770
4771         ereport(LOG,
4772                         (errmsg("starting archive recovery")));
4773
4774         /*
4775          * Parse the file...
4776          */
4777         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
4778         {
4779                 /* skip leading whitespace and check for # comment */
4780                 char       *ptr;
4781                 char       *tok1;
4782                 char       *tok2;
4783
4784                 for (ptr = cmdline; *ptr; ptr++)
4785                 {
4786                         if (!isspace((unsigned char) *ptr))
4787                                 break;
4788                 }
4789                 if (*ptr == '\0' || *ptr == '#')
4790                         continue;
4791
4792                 /* identify the quoted parameter value */
4793                 tok1 = strtok(ptr, "'");
4794                 if (!tok1)
4795                 {
4796                         syntaxError = true;
4797                         break;
4798                 }
4799                 tok2 = strtok(NULL, "'");
4800                 if (!tok2)
4801                 {
4802                         syntaxError = true;
4803                         break;
4804                 }
4805                 /* reparse to get just the parameter name */
4806                 tok1 = strtok(ptr, " \t=");
4807                 if (!tok1)
4808                 {
4809                         syntaxError = true;
4810                         break;
4811                 }
4812
4813                 if (strcmp(tok1, "restore_command") == 0)
4814                 {
4815                         recoveryRestoreCommand = pstrdup(tok2);
4816                         ereport(LOG,
4817                                         (errmsg("restore_command = '%s'",
4818                                                         recoveryRestoreCommand)));
4819                 }
4820                 else if (strcmp(tok1, "recovery_end_command") == 0)
4821                 {
4822                         recoveryEndCommand = pstrdup(tok2);
4823                         ereport(LOG,
4824                                         (errmsg("recovery_end_command = '%s'",
4825                                                         recoveryEndCommand)));
4826                 }
4827                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
4828                 {
4829                         rtliGiven = true;
4830                         if (strcmp(tok2, "latest") == 0)
4831                                 rtli = 0;
4832                         else
4833                         {
4834                                 errno = 0;
4835                                 rtli = (TimeLineID) strtoul(tok2, NULL, 0);
4836                                 if (errno == EINVAL || errno == ERANGE)
4837                                         ereport(FATAL,
4838                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4839                                                                         tok2)));
4840                         }
4841                         if (rtli)
4842                                 ereport(LOG,
4843                                                 (errmsg("recovery_target_timeline = %u", rtli)));
4844                         else
4845                                 ereport(LOG,
4846                                                 (errmsg("recovery_target_timeline = latest")));
4847                 }
4848                 else if (strcmp(tok1, "recovery_target_xid") == 0)
4849                 {
4850                         errno = 0;
4851                         recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
4852                         if (errno == EINVAL || errno == ERANGE)
4853                                 ereport(FATAL,
4854                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4855                                                  tok2)));
4856                         ereport(LOG,
4857                                         (errmsg("recovery_target_xid = %u",
4858                                                         recoveryTargetXid)));
4859                         recoveryTarget = true;
4860                         recoveryTargetExact = true;
4861                 }
4862                 else if (strcmp(tok1, "recovery_target_time") == 0)
4863                 {
4864                         /*
4865                          * if recovery_target_xid specified, then this overrides
4866                          * recovery_target_time
4867                          */
4868                         if (recoveryTargetExact)
4869                                 continue;
4870                         recoveryTarget = true;
4871                         recoveryTargetExact = false;
4872
4873                         /*
4874                          * Convert the time string given by the user to TimestampTz form.
4875                          */
4876                         recoveryTargetTime =
4877                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4878                                                                                                                 CStringGetDatum(tok2),
4879                                                                                                 ObjectIdGetDatum(InvalidOid),
4880                                                                                                                 Int32GetDatum(-1)));
4881                         ereport(LOG,
4882                                         (errmsg("recovery_target_time = '%s'",
4883                                                         timestamptz_to_str(recoveryTargetTime))));
4884                 }
4885                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
4886                 {
4887                         /*
4888                          * does nothing if a recovery_target is not also set
4889                          */
4890                         if (!parse_bool(tok2, &recoveryTargetInclusive))
4891                                 ereport(ERROR,
4892                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4893                                                  errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
4894                         ereport(LOG,
4895                                         (errmsg("recovery_target_inclusive = %s", tok2)));
4896                 }
4897                 else
4898                         ereport(FATAL,
4899                                         (errmsg("unrecognized recovery parameter \"%s\"",
4900                                                         tok1)));
4901         }
4902
4903         FreeFile(fd);
4904
4905         if (syntaxError)
4906                 ereport(FATAL,
4907                                 (errmsg("syntax error in recovery command file: %s",
4908                                                 cmdline),
4909                           errhint("Lines should have the format parameter = 'value'.")));
4910
4911         /* Check that required parameters were supplied */
4912         if (recoveryRestoreCommand == NULL)
4913                 ereport(FATAL,
4914                                 (errmsg("recovery command file \"%s\" did not specify restore_command",
4915                                                 RECOVERY_COMMAND_FILE)));
4916
4917         /* Enable fetching from archive recovery area */
4918         InArchiveRecovery = true;
4919
4920         /*
4921          * If user specified recovery_target_timeline, validate it or compute the
4922          * "latest" value.      We can't do this until after we've gotten the restore
4923          * command and set InArchiveRecovery, because we need to fetch timeline
4924          * history files from the archive.
4925          */
4926         if (rtliGiven)
4927         {
4928                 if (rtli)
4929                 {
4930                         /* Timeline 1 does not have a history file, all else should */
4931                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4932                                 ereport(FATAL,
4933                                                 (errmsg("recovery target timeline %u does not exist",
4934                                                                 rtli)));
4935                         recoveryTargetTLI = rtli;
4936                 }
4937                 else
4938                 {
4939                         /* We start the "latest" search from pg_control's timeline */
4940                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4941                 }
4942         }
4943 }
4944
4945 /*
4946  * Exit archive-recovery state
4947  */
4948 static void
4949 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4950 {
4951         char            recoveryPath[MAXPGPATH];
4952         char            xlogpath[MAXPGPATH];
4953         XLogRecPtr      InvalidXLogRecPtr = {0, 0};
4954
4955         /*
4956          * We are no longer in archive recovery state.
4957          */
4958         InArchiveRecovery = false;
4959
4960         /*
4961          * Update min recovery point one last time.
4962          */
4963         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4964
4965         /*
4966          * We should have the ending log segment currently open.  Verify, and then
4967          * close it (to avoid problems on Windows with trying to rename or delete
4968          * an open file).
4969          */
4970         Assert(readFile >= 0);
4971         Assert(readId == endLogId);
4972         Assert(readSeg == endLogSeg);
4973
4974         close(readFile);
4975         readFile = -1;
4976
4977         /*
4978          * If the segment was fetched from archival storage, we want to replace
4979          * the existing xlog segment (if any) with the archival version.  This is
4980          * because whatever is in XLOGDIR is very possibly older than what we have
4981          * from the archives, since it could have come from restoring a PGDATA
4982          * backup.      In any case, the archival version certainly is more
4983          * descriptive of what our current database state is, because that is what
4984          * we replayed from.
4985          *
4986          * Note that if we are establishing a new timeline, ThisTimeLineID is
4987          * already set to the new value, and so we will create a new file instead
4988          * of overwriting any existing file.  (This is, in fact, always the case
4989          * at present.)
4990          */
4991         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4992         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4993
4994         if (restoredFromArchive)
4995         {
4996                 ereport(DEBUG3,
4997                                 (errmsg_internal("moving last restored xlog to \"%s\"",
4998                                                                  xlogpath)));
4999                 unlink(xlogpath);               /* might or might not exist */
5000                 if (rename(recoveryPath, xlogpath) != 0)
5001                         ereport(FATAL,
5002                                         (errcode_for_file_access(),
5003                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
5004                                                         recoveryPath, xlogpath)));
5005                 /* XXX might we need to fix permissions on the file? */
5006         }
5007         else
5008         {
5009                 /*
5010                  * If the latest segment is not archival, but there's still a
5011                  * RECOVERYXLOG laying about, get rid of it.
5012                  */
5013                 unlink(recoveryPath);   /* ignore any error */
5014
5015                 /*
5016                  * If we are establishing a new timeline, we have to copy data from
5017                  * the last WAL segment of the old timeline to create a starting WAL
5018                  * segment for the new timeline.
5019                  *
5020                  * Notify the archiver that the last WAL segment of the old timeline
5021                  * is ready to copy to archival storage. Otherwise, it is not archived
5022                  * for a while.
5023                  */
5024                 if (endTLI != ThisTimeLineID)
5025                 {
5026                         XLogFileCopy(endLogId, endLogSeg,
5027                                                  endTLI, endLogId, endLogSeg);
5028
5029                         if (XLogArchivingActive())
5030                         {
5031                                 XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
5032                                 XLogArchiveNotify(xlogpath);
5033                         }
5034                 }
5035         }
5036
5037         /*
5038          * Let's just make real sure there are not .ready or .done flags posted
5039          * for the new segment.
5040          */
5041         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5042         XLogArchiveCleanup(xlogpath);
5043
5044         /* Get rid of any remaining recovered timeline-history file, too */
5045         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5046         unlink(recoveryPath);           /* ignore any error */
5047
5048         /*
5049          * Rename the config file out of the way, so that we don't accidentally
5050          * re-enter archive recovery mode in a subsequent crash.
5051          */
5052         unlink(RECOVERY_COMMAND_DONE);
5053         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5054                 ereport(FATAL,
5055                                 (errcode_for_file_access(),
5056                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5057                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5058
5059         ereport(LOG,
5060                         (errmsg("archive recovery complete")));
5061 }
5062
5063 /*
5064  * For point-in-time recovery, this function decides whether we want to
5065  * stop applying the XLOG at or after the current record.
5066  *
5067  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5068  * *includeThis is set TRUE if we should apply this record before stopping.
5069  *
5070  * We also track the timestamp of the latest applied COMMIT/ABORT record
5071  * in recoveryLastXTime, for logging purposes.
5072  * Also, some information is saved in recoveryStopXid et al for use in
5073  * annotating the new timeline's history file.
5074  */
5075 static bool
5076 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5077 {
5078         bool            stopsHere;
5079         uint8           record_info;
5080         TimestampTz recordXtime;
5081
5082         /* We only consider stopping at COMMIT or ABORT records */
5083         if (record->xl_rmid != RM_XACT_ID)
5084                 return false;
5085         record_info = record->xl_info & ~XLR_INFO_MASK;
5086         if (record_info == XLOG_XACT_COMMIT)
5087         {
5088                 xl_xact_commit *recordXactCommitData;
5089
5090                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5091                 recordXtime = recordXactCommitData->xact_time;
5092         }
5093         else if (record_info == XLOG_XACT_ABORT)
5094         {
5095                 xl_xact_abort *recordXactAbortData;
5096
5097                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5098                 recordXtime = recordXactAbortData->xact_time;
5099         }
5100         else
5101                 return false;
5102
5103         /* Do we have a PITR target at all? */
5104         if (!recoveryTarget)
5105         {
5106                 recoveryLastXTime = recordXtime;
5107                 return false;
5108         }
5109
5110         if (recoveryTargetExact)
5111         {
5112                 /*
5113                  * there can be only one transaction end record with this exact
5114                  * transactionid
5115                  *
5116                  * when testing for an xid, we MUST test for equality only, since
5117                  * transactions are numbered in the order they start, not the order
5118                  * they complete. A higher numbered xid will complete before you about
5119                  * 50% of the time...
5120                  */
5121                 stopsHere = (record->xl_xid == recoveryTargetXid);
5122                 if (stopsHere)
5123                         *includeThis = recoveryTargetInclusive;
5124         }
5125         else
5126         {
5127                 /*
5128                  * there can be many transactions that share the same commit time, so
5129                  * we stop after the last one, if we are inclusive, or stop at the
5130                  * first one if we are exclusive
5131                  */
5132                 if (recoveryTargetInclusive)
5133                         stopsHere = (recordXtime > recoveryTargetTime);
5134                 else
5135                         stopsHere = (recordXtime >= recoveryTargetTime);
5136                 if (stopsHere)
5137                         *includeThis = false;
5138         }
5139
5140         if (stopsHere)
5141         {
5142                 recoveryStopXid = record->xl_xid;
5143                 recoveryStopTime = recordXtime;
5144                 recoveryStopAfter = *includeThis;
5145
5146                 if (record_info == XLOG_XACT_COMMIT)
5147                 {
5148                         if (recoveryStopAfter)
5149                                 ereport(LOG,
5150                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5151                                                                 recoveryStopXid,
5152                                                                 timestamptz_to_str(recoveryStopTime))));
5153                         else
5154                                 ereport(LOG,
5155                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5156                                                                 recoveryStopXid,
5157                                                                 timestamptz_to_str(recoveryStopTime))));
5158                 }
5159                 else
5160                 {
5161                         if (recoveryStopAfter)
5162                                 ereport(LOG,
5163                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5164                                                                 recoveryStopXid,
5165                                                                 timestamptz_to_str(recoveryStopTime))));
5166                         else
5167                                 ereport(LOG,
5168                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5169                                                                 recoveryStopXid,
5170                                                                 timestamptz_to_str(recoveryStopTime))));
5171                 }
5172
5173                 if (recoveryStopAfter)
5174                         recoveryLastXTime = recordXtime;
5175         }
5176         else
5177                 recoveryLastXTime = recordXtime;
5178
5179         return stopsHere;
5180 }
5181
5182 /*
5183  * This must be called ONCE during postmaster or standalone-backend startup
5184  */
5185 void
5186 StartupXLOG(void)
5187 {
5188         XLogCtlInsert *Insert;
5189         CheckPoint      checkPoint;
5190         bool            wasShutdown;
5191         bool            reachedStopPoint = false;
5192         bool            haveBackupLabel = false;
5193         XLogRecPtr      RecPtr,
5194                                 LastRec,
5195                                 checkPointLoc,
5196                                 backupStopLoc,
5197                                 EndOfLog;
5198         uint32          endLogId;
5199         uint32          endLogSeg;
5200         XLogRecord *record;
5201         uint32          freespace;
5202         TransactionId oldestActiveXID;
5203         bool            bgwriterLaunched = false;
5204
5205         /*
5206          * Read control file and check XLOG status looks valid.
5207          *
5208          * Note: in most control paths, *ControlFile is already valid and we need
5209          * not do ReadControlFile() here, but might as well do it to be sure.
5210          */
5211         ReadControlFile();
5212
5213         if (ControlFile->state < DB_SHUTDOWNED ||
5214                 ControlFile->state > DB_IN_PRODUCTION ||
5215                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5216                 ereport(FATAL,
5217                                 (errmsg("control file contains invalid data")));
5218
5219         if (ControlFile->state == DB_SHUTDOWNED)
5220                 ereport(LOG,
5221                                 (errmsg("database system was shut down at %s",
5222                                                 str_time(ControlFile->time))));
5223         else if (ControlFile->state == DB_SHUTDOWNING)
5224                 ereport(LOG,
5225                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5226                                                 str_time(ControlFile->time))));
5227         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5228                 ereport(LOG,
5229                    (errmsg("database system was interrupted while in recovery at %s",
5230                                    str_time(ControlFile->time)),
5231                         errhint("This probably means that some data is corrupted and"
5232                                         " you will have to use the last backup for recovery.")));
5233         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5234                 ereport(LOG,
5235                                 (errmsg("database system was interrupted while in recovery at log time %s",
5236                                                 str_time(ControlFile->checkPointCopy.time)),
5237                                  errhint("If this has occurred more than once some data might be corrupted"
5238                           " and you might need to choose an earlier recovery target.")));
5239         else if (ControlFile->state == DB_IN_PRODUCTION)
5240                 ereport(LOG,
5241                           (errmsg("database system was interrupted; last known up at %s",
5242                                           str_time(ControlFile->time))));
5243
5244         /* This is just to allow attaching to startup process with a debugger */
5245 #ifdef XLOG_REPLAY_DELAY
5246         if (ControlFile->state != DB_SHUTDOWNED)
5247                 pg_usleep(60000000L);
5248 #endif
5249
5250         /*
5251          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5252          * someone has performed a copy for PITR, these directories may have been
5253          * excluded and need to be re-created.
5254          */
5255         ValidateXLOGDirectoryStructure();
5256
5257         /*
5258          * Clear out any old relcache cache files.  This is *necessary* if we
5259          * do any WAL replay, since that would probably result in the cache files
5260          * being out of sync with database reality.  In theory we could leave
5261          * them in place if the database had been cleanly shut down, but it
5262          * seems safest to just remove them always and let them be rebuilt
5263          * during the first backend startup.
5264          */
5265         RelationCacheInitFileRemove();
5266
5267         /*
5268          * Initialize on the assumption we want to recover to the same timeline
5269          * that's active according to pg_control.
5270          */
5271         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
5272
5273         /*
5274          * Check for recovery control file, and if so set up state for offline
5275          * recovery
5276          */
5277         readRecoveryCommandFile();
5278
5279         /* Now we can determine the list of expected TLIs */
5280         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
5281
5282         /*
5283          * If pg_control's timeline is not in expectedTLIs, then we cannot
5284          * proceed: the backup is not part of the history of the requested
5285          * timeline.
5286          */
5287         if (!list_member_int(expectedTLIs,
5288                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
5289                 ereport(FATAL,
5290                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
5291                                                 recoveryTargetTLI,
5292                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
5293
5294         if (read_backup_label(&checkPointLoc, &backupStopLoc))
5295         {
5296                 /*
5297                  * When a backup_label file is present, we want to roll forward from
5298                  * the checkpoint it identifies, rather than using pg_control.
5299                  */
5300                 record = ReadCheckpointRecord(checkPointLoc, 0);
5301                 if (record != NULL)
5302                 {
5303                         ereport(DEBUG1,
5304                                         (errmsg("checkpoint record is at %X/%X",
5305                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5306                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5307                 }
5308                 else
5309                 {
5310                         ereport(PANIC,
5311                                         (errmsg("could not locate required checkpoint record"),
5312                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5313                 }
5314                 /* set flag to delete it later */
5315                 haveBackupLabel = true;
5316         }
5317         else
5318         {
5319                 /*
5320                  * Get the last valid checkpoint record.  If the latest one according
5321                  * to pg_control is broken, try the next-to-last one.
5322                  */
5323                 checkPointLoc = ControlFile->checkPoint;
5324                 record = ReadCheckpointRecord(checkPointLoc, 1);
5325                 if (record != NULL)
5326                 {
5327                         ereport(DEBUG1,
5328                                         (errmsg("checkpoint record is at %X/%X",
5329                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5330                 }
5331                 else
5332                 {
5333                         checkPointLoc = ControlFile->prevCheckPoint;
5334                         record = ReadCheckpointRecord(checkPointLoc, 2);
5335                         if (record != NULL)
5336                         {
5337                                 ereport(LOG,
5338                                                 (errmsg("using previous checkpoint record at %X/%X",
5339                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5340                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5341                         }
5342                         else
5343                                 ereport(PANIC,
5344                                          (errmsg("could not locate a valid checkpoint record")));
5345                 }
5346         }
5347
5348         LastRec = RecPtr = checkPointLoc;
5349         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5350         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5351
5352         ereport(DEBUG1,
5353                         (errmsg("redo record is at %X/%X; shutdown %s",
5354                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
5355                                         wasShutdown ? "TRUE" : "FALSE")));
5356         ereport(DEBUG1,
5357                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5358                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5359                                         checkPoint.nextOid)));
5360         ereport(DEBUG1,
5361                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5362                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5363         ereport(DEBUG1,
5364                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
5365                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
5366         if (!TransactionIdIsNormal(checkPoint.nextXid))
5367                 ereport(PANIC,
5368                                 (errmsg("invalid next transaction ID")));
5369
5370         ShmemVariableCache->nextXid = checkPoint.nextXid;
5371         ShmemVariableCache->nextOid = checkPoint.nextOid;
5372         ShmemVariableCache->oidCount = 0;
5373         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5374         ShmemVariableCache->oldestXid = checkPoint.oldestXid;
5375         ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
5376
5377         /*
5378          * We must replay WAL entries using the same TimeLineID they were created
5379          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5380          * also xlog_redo()).
5381          */
5382         ThisTimeLineID = checkPoint.ThisTimeLineID;
5383
5384         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5385
5386         if (XLByteLT(RecPtr, checkPoint.redo))
5387                 ereport(PANIC,
5388                                 (errmsg("invalid redo in checkpoint record")));
5389
5390         /*
5391          * Check whether we need to force recovery from WAL.  If it appears to
5392          * have been a clean shutdown and we did not have a recovery.conf file,
5393          * then assume no recovery needed.
5394          */
5395         if (XLByteLT(checkPoint.redo, RecPtr))
5396         {
5397                 if (wasShutdown)
5398                         ereport(PANIC,
5399                                         (errmsg("invalid redo record in shutdown checkpoint")));
5400                 InRecovery = true;
5401         }
5402         else if (ControlFile->state != DB_SHUTDOWNED)
5403                 InRecovery = true;
5404         else if (InArchiveRecovery)
5405         {
5406                 /* force recovery due to presence of recovery.conf */
5407                 InRecovery = true;
5408         }
5409
5410         /* REDO */
5411         if (InRecovery)
5412         {
5413                 int                     rmid;
5414
5415                 /*
5416                  * Update pg_control to show that we are recovering and to show the
5417                  * selected checkpoint as the place we are starting from. We also mark
5418                  * pg_control with any minimum recovery stop point obtained from a
5419                  * backup history file.
5420                  */
5421                 if (InArchiveRecovery)
5422                 {
5423                         ereport(LOG,
5424                                         (errmsg("automatic recovery in progress")));
5425                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5426                 }
5427                 else
5428                 {
5429                         ereport(LOG,
5430                                         (errmsg("database system was not properly shut down; "
5431                                                         "automatic recovery in progress")));
5432                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5433                 }
5434                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5435                 ControlFile->checkPoint = checkPointLoc;
5436                 ControlFile->checkPointCopy = checkPoint;
5437                 if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
5438                 {
5439                         if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
5440                                 ControlFile->minRecoveryPoint = backupStopLoc;
5441                 }
5442                 ControlFile->time = (pg_time_t) time(NULL);
5443                 /* No need to hold ControlFileLock yet, we aren't up far enough */
5444                 UpdateControlFile();
5445
5446                 /* initialize our local copy of minRecoveryPoint */
5447                 minRecoveryPoint = ControlFile->minRecoveryPoint;
5448
5449                 /*
5450                  * Reset pgstat data, because it may be invalid after recovery.
5451                  */
5452                 pgstat_reset_all();
5453
5454                 /*
5455                  * If there was a backup label file, it's done its job and the info
5456                  * has now been propagated into pg_control.  We must get rid of the
5457                  * label file so that if we crash during recovery, we'll pick up at
5458                  * the latest recovery restartpoint instead of going all the way back
5459                  * to the backup start point.  It seems prudent though to just rename
5460                  * the file out of the way rather than delete it completely.
5461                  */
5462                 if (haveBackupLabel)
5463                 {
5464                         unlink(BACKUP_LABEL_OLD);
5465                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5466                                 ereport(FATAL,
5467                                                 (errcode_for_file_access(),
5468                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5469                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5470                 }
5471
5472                 /* Initialize resource managers */
5473                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5474                 {
5475                         if (RmgrTable[rmid].rm_startup != NULL)
5476                                 RmgrTable[rmid].rm_startup();
5477                 }
5478
5479                 /*
5480                  * Find the first record that logically follows the checkpoint --- it
5481                  * might physically precede it, though.
5482                  */
5483                 if (XLByteLT(checkPoint.redo, RecPtr))
5484                 {
5485                         /* back up to find the record */
5486                         record = ReadRecord(&(checkPoint.redo), PANIC);
5487                 }
5488                 else
5489                 {
5490                         /* just have to read next record after CheckPoint */
5491                         record = ReadRecord(NULL, LOG);
5492                 }
5493
5494                 if (record != NULL)
5495                 {
5496                         bool            recoveryContinue = true;
5497                         bool            recoveryApply = true;
5498                         bool            reachedMinRecoveryPoint = false;
5499                         ErrorContextCallback errcontext;
5500
5501                         /* use volatile pointer to prevent code rearrangement */
5502                         volatile XLogCtlData *xlogctl = XLogCtl;
5503
5504                         /* initialize shared replayEndRecPtr */
5505                         SpinLockAcquire(&xlogctl->info_lck);
5506                         xlogctl->replayEndRecPtr = ReadRecPtr;
5507                         SpinLockRelease(&xlogctl->info_lck);
5508
5509                         InRedo = true;
5510
5511                         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
5512                                 ereport(LOG,
5513                                                 (errmsg("redo starts at %X/%X",
5514                                                                 ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5515                         else
5516                                 ereport(LOG,
5517                                                 (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
5518                                                                 ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5519                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
5520
5521                         /*
5522                          * Let postmaster know we've started redo now, so that it can
5523                          * launch bgwriter to perform restartpoints.  We don't bother
5524                          * during crash recovery as restartpoints can only be performed
5525                          * during archive recovery.  And we'd like to keep crash recovery
5526                          * simple, to avoid introducing bugs that could prevent you from
5527                          * recovering after crash.
5528                          *
5529                          * After this point, we can no longer assume that we're the only
5530                          * process in addition to postmaster!  Also, fsync requests are
5531                          * subsequently to be handled by the bgwriter, not locally.
5532                          */
5533                         if (InArchiveRecovery && IsUnderPostmaster)
5534                         {
5535                                 SetForwardFsyncRequests();
5536                                 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
5537                                 bgwriterLaunched = true;
5538                         }
5539
5540                         /*
5541                          * main redo apply loop
5542                          */
5543                         do
5544                         {
5545 #ifdef WAL_DEBUG
5546                                 if (XLOG_DEBUG)
5547                                 {
5548                                         StringInfoData buf;
5549
5550                                         initStringInfo(&buf);
5551                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5552                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5553                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
5554                                         xlog_outrec(&buf, record);
5555                                         appendStringInfo(&buf, " - ");
5556                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5557                                                                                                            record->xl_info,
5558                                                                                                          XLogRecGetData(record));
5559                                         elog(LOG, "%s", buf.data);
5560                                         pfree(buf.data);
5561                                 }
5562 #endif
5563
5564                                 /*
5565                                  * Check if we were requested to re-read config file.
5566                                  */
5567                                 if (got_SIGHUP)
5568                                 {
5569                                         got_SIGHUP = false;
5570                                         ProcessConfigFile(PGC_SIGHUP);
5571                                 }
5572
5573                                 /*
5574                                  * Check if we were requested to exit without finishing
5575                                  * recovery.
5576                                  */
5577                                 if (shutdown_requested)
5578                                         proc_exit(1);
5579
5580                                 /*
5581                                  * Have we passed our safe starting point? If so, we can tell
5582                                  * postmaster that the database is consistent now.
5583                                  */
5584                                 if (!reachedMinRecoveryPoint &&
5585                                         XLByteLT(minRecoveryPoint, EndRecPtr))
5586                                 {
5587                                         reachedMinRecoveryPoint = true;
5588                                         if (InArchiveRecovery)
5589                                         {
5590                                                 ereport(LOG,
5591                                                           (errmsg("consistent recovery state reached")));
5592                                                 if (IsUnderPostmaster)
5593                                                         SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
5594                                         }
5595                                 }
5596
5597                                 /*
5598                                  * Have we reached our recovery target?
5599                                  */
5600                                 if (recoveryStopsHere(record, &recoveryApply))
5601                                 {
5602                                         reachedStopPoint = true;        /* see below */
5603                                         recoveryContinue = false;
5604                                         if (!recoveryApply)
5605                                                 break;
5606                                 }
5607
5608                                 /* Setup error traceback support for ereport() */
5609                                 errcontext.callback = rm_redo_error_callback;
5610                                 errcontext.arg = (void *) record;
5611                                 errcontext.previous = error_context_stack;
5612                                 error_context_stack = &errcontext;
5613
5614                                 /* nextXid must be beyond record's xid */
5615                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5616                                                                                                  ShmemVariableCache->nextXid))
5617                                 {
5618                                         ShmemVariableCache->nextXid = record->xl_xid;
5619                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5620                                 }
5621
5622                                 /*
5623                                  * Update shared replayEndRecPtr before replaying this record,
5624                                  * so that XLogFlush will update minRecoveryPoint correctly.
5625                                  */
5626                                 SpinLockAcquire(&xlogctl->info_lck);
5627                                 xlogctl->replayEndRecPtr = EndRecPtr;
5628                                 SpinLockRelease(&xlogctl->info_lck);
5629
5630                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5631
5632                                 /* Pop the error context stack */
5633                                 error_context_stack = errcontext.previous;
5634
5635                                 LastRec = ReadRecPtr;
5636
5637                                 record = ReadRecord(NULL, LOG);
5638                         } while (record != NULL && recoveryContinue);
5639
5640                         /*
5641                          * end of main redo apply loop
5642                          */
5643
5644                         ereport(LOG,
5645                                         (errmsg("redo done at %X/%X",
5646                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5647                         if (recoveryLastXTime)
5648                                 ereport(LOG,
5649                                          (errmsg("last completed transaction was at log time %s",
5650                                                          timestamptz_to_str(recoveryLastXTime))));
5651                         InRedo = false;
5652                 }
5653                 else
5654                 {
5655                         /* there are no WAL records following the checkpoint */
5656                         ereport(LOG,
5657                                         (errmsg("redo is not required")));
5658                 }
5659         }
5660
5661         /*
5662          * Re-fetch the last valid or last applied record, so we can identify the
5663          * exact endpoint of what we consider the valid portion of WAL.
5664          */
5665         record = ReadRecord(&LastRec, PANIC);
5666         EndOfLog = EndRecPtr;
5667         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
5668
5669         /*
5670          * Complain if we did not roll forward far enough to render the backup
5671          * dump consistent.  Note: it is indeed okay to look at the local variable
5672          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
5673          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
5674          * advanced beyond the WAL we processed.
5675          */
5676         if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
5677         {
5678                 if (reachedStopPoint)   /* stopped because of stop request */
5679                         ereport(FATAL,
5680                                         (errmsg("requested recovery stop point is before consistent recovery point")));
5681                 else    /* ran off end of WAL */
5682                         ereport(FATAL,
5683                                         (errmsg("WAL ends before consistent recovery point")));
5684         }
5685
5686         /*
5687          * Consider whether we need to assign a new timeline ID.
5688          *
5689          * If we are doing an archive recovery, we always assign a new ID.      This
5690          * handles a couple of issues.  If we stopped short of the end of WAL
5691          * during recovery, then we are clearly generating a new timeline and must
5692          * assign it a unique new ID.  Even if we ran to the end, modifying the
5693          * current last segment is problematic because it may result in trying to
5694          * overwrite an already-archived copy of that segment, and we encourage
5695          * DBAs to make their archive_commands reject that.  We can dodge the
5696          * problem by making the new active segment have a new timeline ID.
5697          *
5698          * In a normal crash recovery, we can just extend the timeline we were in.
5699          */
5700         if (InArchiveRecovery)
5701         {
5702                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5703                 ereport(LOG,
5704                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5705                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5706                                                          curFileTLI, endLogId, endLogSeg);
5707         }
5708
5709         /* Save the selected TimeLineID in shared memory, too */
5710         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5711
5712         /*
5713          * We are now done reading the old WAL.  Turn off archive fetching if it
5714          * was active, and make a writable copy of the last WAL segment. (Note
5715          * that we also have a copy of the last block of the old WAL in readBuf;
5716          * we will use that below.)
5717          */
5718         if (InArchiveRecovery)
5719                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
5720
5721         /*
5722          * Prepare to write WAL starting at EndOfLog position, and init xlog
5723          * buffer cache using the block containing the last record from the
5724          * previous incarnation.
5725          */
5726         openLogId = endLogId;
5727         openLogSeg = endLogSeg;
5728         openLogFile = XLogFileOpen(openLogId, openLogSeg);
5729         openLogOff = 0;
5730         Insert = &XLogCtl->Insert;
5731         Insert->PrevRecord = LastRec;
5732         XLogCtl->xlblocks[0].xlogid = openLogId;
5733         XLogCtl->xlblocks[0].xrecoff =
5734                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5735
5736         /*
5737          * Tricky point here: readBuf contains the *last* block that the LastRec
5738          * record spans, not the one it starts in.      The last block is indeed the
5739          * one we want to use.
5740          */
5741         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
5742         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
5743         Insert->currpos = (char *) Insert->currpage +
5744                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
5745
5746         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5747
5748         XLogCtl->Write.LogwrtResult = LogwrtResult;
5749         Insert->LogwrtResult = LogwrtResult;
5750         XLogCtl->LogwrtResult = LogwrtResult;
5751
5752         XLogCtl->LogwrtRqst.Write = EndOfLog;
5753         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5754
5755         freespace = INSERT_FREESPACE(Insert);
5756         if (freespace > 0)
5757         {
5758                 /* Make sure rest of page is zero */
5759                 MemSet(Insert->currpos, 0, freespace);
5760                 XLogCtl->Write.curridx = 0;
5761         }
5762         else
5763         {
5764                 /*
5765                  * Whenever Write.LogwrtResult points to exactly the end of a page,
5766                  * Write.curridx must point to the *next* page (see XLogWrite()).
5767                  *
5768                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5769                  * this is sufficient.  The first actual attempt to insert a log
5770                  * record will advance the insert state.
5771                  */
5772                 XLogCtl->Write.curridx = NextBufIdx(0);
5773         }
5774
5775         /* Pre-scan prepared transactions to find out the range of XIDs present */
5776         oldestActiveXID = PrescanPreparedTransactions();
5777
5778         if (InRecovery)
5779         {
5780                 int                     rmid;
5781
5782                 /*
5783                  * Resource managers might need to write WAL records, eg, to record
5784                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
5785                  * this process only.
5786                  */
5787                 LocalSetXLogInsertAllowed();
5788
5789                 /*
5790                  * Allow resource managers to do any required cleanup.
5791                  */
5792                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5793                 {
5794                         if (RmgrTable[rmid].rm_cleanup != NULL)
5795                                 RmgrTable[rmid].rm_cleanup();
5796                 }
5797
5798                 /* Disallow XLogInsert again */
5799                 LocalXLogInsertAllowed = -1;
5800
5801                 /*
5802                  * Check to see if the XLOG sequence contained any unresolved
5803                  * references to uninitialized pages.
5804                  */
5805                 XLogCheckInvalidPages();
5806
5807                 /*
5808                  * Perform a checkpoint to update all our recovery activity to disk.
5809                  *
5810                  * Note that we write a shutdown checkpoint rather than an on-line
5811                  * one. This is not particularly critical, but since we may be
5812                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5813                  * the rule that TLI only changes in shutdown checkpoints, which
5814                  * allows some extra error checking in xlog_redo.
5815                  */
5816                 if (bgwriterLaunched)
5817                         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5818                                                           CHECKPOINT_IMMEDIATE |
5819                                                           CHECKPOINT_WAIT);
5820                 else
5821                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
5822
5823                 /*
5824                  * And finally, execute the recovery_end_command, if any.
5825                  */
5826                 if (recoveryEndCommand)
5827                         ExecuteRecoveryEndCommand();
5828         }
5829
5830         /*
5831          * Preallocate additional log files, if wanted.
5832          */
5833         PreallocXlogFiles(EndOfLog);
5834
5835         /*
5836          * Okay, we're officially UP.
5837          */
5838         InRecovery = false;
5839
5840         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5841         ControlFile->state = DB_IN_PRODUCTION;
5842         ControlFile->time = (pg_time_t) time(NULL);
5843         UpdateControlFile();
5844         LWLockRelease(ControlFileLock);
5845
5846         /* start the archive_timeout timer running */
5847         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
5848
5849         /* initialize shared-memory copy of latest checkpoint XID/epoch */
5850         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5851         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
5852
5853         /* also initialize latestCompletedXid, to nextXid - 1 */
5854         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
5855         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
5856
5857         /* Start up the commit log and related stuff, too */
5858         StartupCLOG();
5859         StartupSUBTRANS(oldestActiveXID);
5860         StartupMultiXact();
5861
5862         /* Reload shared-memory state for prepared transactions */
5863         RecoverPreparedTransactions();
5864
5865         /* Shut down readFile facility, free space */
5866         if (readFile >= 0)
5867         {
5868                 close(readFile);
5869                 readFile = -1;
5870         }
5871         if (readBuf)
5872         {
5873                 free(readBuf);
5874                 readBuf = NULL;
5875         }
5876         if (readRecordBuf)
5877         {
5878                 free(readRecordBuf);
5879                 readRecordBuf = NULL;
5880                 readRecordBufSize = 0;
5881         }
5882
5883         /*
5884          * All done.  Allow backends to write WAL.  (Although the bool flag is
5885          * probably atomic in itself, we use the info_lck here to ensure that
5886          * there are no race conditions concerning visibility of other recent
5887          * updates to shared memory.)
5888          */
5889         {
5890                 /* use volatile pointer to prevent code rearrangement */
5891                 volatile XLogCtlData *xlogctl = XLogCtl;
5892
5893                 SpinLockAcquire(&xlogctl->info_lck);
5894                 xlogctl->SharedRecoveryInProgress = false;
5895                 SpinLockRelease(&xlogctl->info_lck);
5896         }
5897 }
5898
5899 /*
5900  * Is the system still in recovery?
5901  *
5902  * Unlike testing InRecovery, this works in any process that's connected to
5903  * shared memory.
5904  *
5905  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
5906  * variables the first time we see that recovery is finished.
5907  */
5908 bool
5909 RecoveryInProgress(void)
5910 {
5911         /*
5912          * We check shared state each time only until we leave recovery mode.
5913          * We can't re-enter recovery, so there's no need to keep checking after
5914          * the shared variable has once been seen false.
5915          */
5916         if (!LocalRecoveryInProgress)
5917                 return false;
5918         else
5919         {
5920                 /* use volatile pointer to prevent code rearrangement */
5921                 volatile XLogCtlData *xlogctl = XLogCtl;
5922
5923                 /* spinlock is essential on machines with weak memory ordering! */
5924                 SpinLockAcquire(&xlogctl->info_lck);
5925                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
5926                 SpinLockRelease(&xlogctl->info_lck);
5927
5928                 /*
5929                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
5930                  * is finished.  (If you change this, see also
5931                  * LocalSetXLogInsertAllowed.)
5932                  */
5933                 if (!LocalRecoveryInProgress)
5934                         InitXLOGAccess();
5935
5936                 return LocalRecoveryInProgress;
5937         }
5938 }
5939
5940 /*
5941  * Is this process allowed to insert new WAL records?
5942  *
5943  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
5944  * But we also have provisions for forcing the result "true" or "false"
5945  * within specific processes regardless of the global state.
5946  */
5947 bool
5948 XLogInsertAllowed(void)
5949 {
5950         /*
5951          * If value is "unconditionally true" or "unconditionally false",
5952          * just return it.  This provides the normal fast path once recovery
5953          * is known done.
5954          */
5955         if (LocalXLogInsertAllowed >= 0)
5956                 return (bool) LocalXLogInsertAllowed;
5957
5958         /*
5959          * Else, must check to see if we're still in recovery.
5960          */
5961         if (RecoveryInProgress())
5962                 return false;
5963
5964         /*
5965          * On exit from recovery, reset to "unconditionally true", since there
5966          * is no need to keep checking.
5967          */
5968         LocalXLogInsertAllowed = 1;
5969         return true;
5970 }
5971
5972 /*
5973  * Make XLogInsertAllowed() return true in the current process only.
5974  *
5975  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
5976  * and even call LocalSetXLogInsertAllowed() again after that.
5977  */
5978 static void
5979 LocalSetXLogInsertAllowed(void)
5980 {
5981         Assert(LocalXLogInsertAllowed == -1);
5982         LocalXLogInsertAllowed = 1;
5983
5984         /* Initialize as RecoveryInProgress() would do when switching state */
5985         InitXLOGAccess();
5986 }
5987
5988 /*
5989  * Subroutine to try to fetch and validate a prior checkpoint record.
5990  *
5991  * whichChkpt identifies the checkpoint (merely for reporting purposes).
5992  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
5993  */
5994 static XLogRecord *
5995 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
5996 {
5997         XLogRecord *record;
5998
5999         if (!XRecOffIsValid(RecPtr.xrecoff))
6000         {
6001                 switch (whichChkpt)
6002                 {
6003                         case 1:
6004                                 ereport(LOG,
6005                                 (errmsg("invalid primary checkpoint link in control file")));
6006                                 break;
6007                         case 2:
6008                                 ereport(LOG,
6009                                                 (errmsg("invalid secondary checkpoint link in control file")));
6010                                 break;
6011                         default:
6012                                 ereport(LOG,
6013                                    (errmsg("invalid checkpoint link in backup_label file")));
6014                                 break;
6015                 }
6016                 return NULL;
6017         }
6018
6019         record = ReadRecord(&RecPtr, LOG);
6020
6021         if (record == NULL)
6022         {
6023                 switch (whichChkpt)
6024                 {
6025                         case 1:
6026                                 ereport(LOG,
6027                                                 (errmsg("invalid primary checkpoint record")));
6028                                 break;
6029                         case 2:
6030                                 ereport(LOG,
6031                                                 (errmsg("invalid secondary checkpoint record")));
6032                                 break;
6033                         default:
6034                                 ereport(LOG,
6035                                                 (errmsg("invalid checkpoint record")));
6036                                 break;
6037                 }
6038                 return NULL;
6039         }
6040         if (record->xl_rmid != RM_XLOG_ID)
6041         {
6042                 switch (whichChkpt)
6043                 {
6044                         case 1:
6045                                 ereport(LOG,
6046                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
6047                                 break;
6048                         case 2:
6049                                 ereport(LOG,
6050                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
6051                                 break;
6052                         default:
6053                                 ereport(LOG,
6054                                 (errmsg("invalid resource manager ID in checkpoint record")));
6055                                 break;
6056                 }
6057                 return NULL;
6058         }
6059         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
6060                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
6061         {
6062                 switch (whichChkpt)
6063                 {
6064                         case 1:
6065                                 ereport(LOG,
6066                                    (errmsg("invalid xl_info in primary checkpoint record")));
6067                                 break;
6068                         case 2:
6069                                 ereport(LOG,
6070                                  (errmsg("invalid xl_info in secondary checkpoint record")));
6071                                 break;
6072                         default:
6073                                 ereport(LOG,
6074                                                 (errmsg("invalid xl_info in checkpoint record")));
6075                                 break;
6076                 }
6077                 return NULL;
6078         }
6079         if (record->xl_len != sizeof(CheckPoint) ||
6080                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
6081         {
6082                 switch (whichChkpt)
6083                 {
6084                         case 1:
6085                                 ereport(LOG,
6086                                         (errmsg("invalid length of primary checkpoint record")));
6087                                 break;
6088                         case 2:
6089                                 ereport(LOG,
6090                                   (errmsg("invalid length of secondary checkpoint record")));
6091                                 break;
6092                         default:
6093                                 ereport(LOG,
6094                                                 (errmsg("invalid length of checkpoint record")));
6095                                 break;
6096                 }
6097                 return NULL;
6098         }
6099         return record;
6100 }
6101
6102 /*
6103  * This must be called during startup of a backend process, except that
6104  * it need not be called in a standalone backend (which does StartupXLOG
6105  * instead).  We need to initialize the local copies of ThisTimeLineID and
6106  * RedoRecPtr.
6107  *
6108  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6109  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6110  * unnecessary however, since the postmaster itself never touches XLOG anyway.
6111  */
6112 void
6113 InitXLOGAccess(void)
6114 {
6115         /* ThisTimeLineID doesn't change so we need no lock to copy it */
6116         ThisTimeLineID = XLogCtl->ThisTimeLineID;
6117         Assert(ThisTimeLineID != 0);
6118
6119         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
6120         (void) GetRedoRecPtr();
6121 }
6122
6123 /*
6124  * Once spawned, a backend may update its local RedoRecPtr from
6125  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
6126  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
6127  */
6128 XLogRecPtr
6129 GetRedoRecPtr(void)
6130 {
6131         /* use volatile pointer to prevent code rearrangement */
6132         volatile XLogCtlData *xlogctl = XLogCtl;
6133
6134         SpinLockAcquire(&xlogctl->info_lck);
6135         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
6136         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6137         SpinLockRelease(&xlogctl->info_lck);
6138
6139         return RedoRecPtr;
6140 }
6141
6142 /*
6143  * GetInsertRecPtr -- Returns the current insert position.
6144  *
6145  * NOTE: The value *actually* returned is the position of the last full
6146  * xlog page. It lags behind the real insert position by at most 1 page.
6147  * For that, we don't need to acquire WALInsertLock which can be quite
6148  * heavily contended, and an approximation is enough for the current
6149  * usage of this function.
6150  */
6151 XLogRecPtr
6152 GetInsertRecPtr(void)
6153 {
6154         /* use volatile pointer to prevent code rearrangement */
6155         volatile XLogCtlData *xlogctl = XLogCtl;
6156         XLogRecPtr      recptr;
6157
6158         SpinLockAcquire(&xlogctl->info_lck);
6159         recptr = xlogctl->LogwrtRqst.Write;
6160         SpinLockRelease(&xlogctl->info_lck);
6161
6162         return recptr;
6163 }
6164
6165 /*
6166  * Get the time of the last xlog segment switch
6167  */
6168 pg_time_t
6169 GetLastSegSwitchTime(void)
6170 {
6171         pg_time_t       result;
6172
6173         /* Need WALWriteLock, but shared lock is sufficient */
6174         LWLockAcquire(WALWriteLock, LW_SHARED);
6175         result = XLogCtl->Write.lastSegSwitchTime;
6176         LWLockRelease(WALWriteLock);
6177
6178         return result;
6179 }
6180
6181 /*
6182  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
6183  *
6184  * This is exported for use by code that would like to have 64-bit XIDs.
6185  * We don't really support such things, but all XIDs within the system
6186  * can be presumed "close to" the result, and thus the epoch associated
6187  * with them can be determined.
6188  */
6189 void
6190 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
6191 {
6192         uint32          ckptXidEpoch;
6193         TransactionId ckptXid;
6194         TransactionId nextXid;
6195
6196         /* Must read checkpoint info first, else have race condition */
6197         {
6198                 /* use volatile pointer to prevent code rearrangement */
6199                 volatile XLogCtlData *xlogctl = XLogCtl;
6200
6201                 SpinLockAcquire(&xlogctl->info_lck);
6202                 ckptXidEpoch = xlogctl->ckptXidEpoch;
6203                 ckptXid = xlogctl->ckptXid;
6204                 SpinLockRelease(&xlogctl->info_lck);
6205         }
6206
6207         /* Now fetch current nextXid */
6208         nextXid = ReadNewTransactionId();
6209
6210         /*
6211          * nextXid is certainly logically later than ckptXid.  So if it's
6212          * numerically less, it must have wrapped into the next epoch.
6213          */
6214         if (nextXid < ckptXid)
6215                 ckptXidEpoch++;
6216
6217         *xid = nextXid;
6218         *epoch = ckptXidEpoch;
6219 }
6220
6221 /*
6222  * This must be called ONCE during postmaster or standalone-backend shutdown
6223  */
6224 void
6225 ShutdownXLOG(int code, Datum arg)
6226 {
6227         ereport(LOG,
6228                         (errmsg("shutting down")));
6229
6230         if (RecoveryInProgress())
6231                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6232         else
6233         {
6234                 /*
6235                  * If archiving is enabled, rotate the last XLOG file so that all the
6236                  * remaining records are archived (postmaster wakes up the archiver
6237                  * process one more time at the end of shutdown). The checkpoint
6238                  * record will go to the next XLOG file and won't be archived (yet).
6239                  */
6240                 if (XLogArchivingActive() && XLogArchiveCommandSet())
6241                         RequestXLogSwitch();
6242
6243                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6244         }
6245         ShutdownCLOG();
6246         ShutdownSUBTRANS();
6247         ShutdownMultiXact();
6248
6249         ereport(LOG,
6250                         (errmsg("database system is shut down")));
6251 }
6252
6253 /*
6254  * Log start of a checkpoint.
6255  */
6256 static void
6257 LogCheckpointStart(int flags, bool restartpoint)
6258 {
6259         const char *msg;
6260
6261         /*
6262          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
6263          * the main message, but what about all the flags?
6264          */
6265         if (restartpoint)
6266                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
6267         else
6268                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
6269
6270         elog(LOG, msg,
6271                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6272                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6273                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6274                  (flags & CHECKPOINT_FORCE) ? " force" : "",
6275                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
6276                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
6277                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
6278 }
6279
6280 /*
6281  * Log end of a checkpoint.
6282  */
6283 static void
6284 LogCheckpointEnd(bool restartpoint)
6285 {
6286         long            write_secs,
6287                                 sync_secs,
6288                                 total_secs;
6289         int                     write_usecs,
6290                                 sync_usecs,
6291                                 total_usecs;
6292
6293         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6294
6295         TimestampDifference(CheckpointStats.ckpt_start_t,
6296                                                 CheckpointStats.ckpt_end_t,
6297                                                 &total_secs, &total_usecs);
6298
6299         TimestampDifference(CheckpointStats.ckpt_write_t,
6300                                                 CheckpointStats.ckpt_sync_t,
6301                                                 &write_secs, &write_usecs);
6302
6303         TimestampDifference(CheckpointStats.ckpt_sync_t,
6304                                                 CheckpointStats.ckpt_sync_end_t,
6305                                                 &sync_secs, &sync_usecs);
6306
6307         if (restartpoint)
6308                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
6309                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
6310                          CheckpointStats.ckpt_bufs_written,
6311                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6312                          write_secs, write_usecs / 1000,
6313                          sync_secs, sync_usecs / 1000,
6314                          total_secs, total_usecs / 1000);
6315         else
6316                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
6317                          "%d transaction log file(s) added, %d removed, %d recycled; "
6318                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
6319                          CheckpointStats.ckpt_bufs_written,
6320                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6321                          CheckpointStats.ckpt_segs_added,
6322                          CheckpointStats.ckpt_segs_removed,
6323                          CheckpointStats.ckpt_segs_recycled,
6324                          write_secs, write_usecs / 1000,
6325                          sync_secs, sync_usecs / 1000,
6326                          total_secs, total_usecs / 1000);
6327 }
6328
6329 /*
6330  * Perform a checkpoint --- either during shutdown, or on-the-fly
6331  *
6332  * flags is a bitwise OR of the following:
6333  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6334  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6335  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6336  *              ignoring checkpoint_completion_target parameter.
6337  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
6338  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6339  *              CHECKPOINT_END_OF_RECOVERY).
6340  *
6341  * Note: flags contains other bits, of interest here only for logging purposes.
6342  * In particular note that this routine is synchronous and does not pay
6343  * attention to CHECKPOINT_WAIT.
6344  */
6345 void
6346 CreateCheckPoint(int flags)
6347 {
6348         bool            shutdown;
6349         CheckPoint      checkPoint;
6350         XLogRecPtr      recptr;
6351         XLogCtlInsert *Insert = &XLogCtl->Insert;
6352         XLogRecData rdata;
6353         uint32          freespace;
6354         uint32          _logId;
6355         uint32          _logSeg;
6356         TransactionId *inCommitXids;
6357         int                     nInCommit;
6358
6359         /*
6360          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6361          * issued at a different time.
6362          */
6363         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6364                 shutdown = true;
6365         else
6366                 shutdown = false;
6367
6368         /* sanity check */
6369         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
6370                 elog(ERROR, "can't create a checkpoint during recovery");
6371
6372         /*
6373          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
6374          * (This is just pro forma, since in the present system structure there is
6375          * only one process that is allowed to issue checkpoints at any given
6376          * time.)
6377          */
6378         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6379
6380         /*
6381          * Prepare to accumulate statistics.
6382          *
6383          * Note: because it is possible for log_checkpoints to change while a
6384          * checkpoint proceeds, we always accumulate stats, even if
6385          * log_checkpoints is currently off.
6386          */
6387         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6388         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6389
6390         /*
6391          * Use a critical section to force system panic if we have trouble.
6392          */
6393         START_CRIT_SECTION();
6394
6395         if (shutdown)
6396         {
6397                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6398                 ControlFile->state = DB_SHUTDOWNING;
6399                 ControlFile->time = (pg_time_t) time(NULL);
6400                 UpdateControlFile();
6401                 LWLockRelease(ControlFileLock);
6402         }
6403
6404         /*
6405          * Let smgr prepare for checkpoint; this has to happen before we determine
6406          * the REDO pointer.  Note that smgr must not do anything that'd have to
6407          * be undone if we decide no checkpoint is needed.
6408          */
6409         smgrpreckpt();
6410
6411         /* Begin filling in the checkpoint WAL record */
6412         MemSet(&checkPoint, 0, sizeof(checkPoint));
6413         checkPoint.time = (pg_time_t) time(NULL);
6414
6415         /*
6416          * We must hold WALInsertLock while examining insert state to determine
6417          * the checkpoint REDO pointer.
6418          */
6419         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6420
6421         /*
6422          * If this isn't a shutdown or forced checkpoint, and we have not inserted
6423          * any XLOG records since the start of the last checkpoint, skip the
6424          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
6425          * when the system is idle. That wastes log space, and more importantly it
6426          * exposes us to possible loss of both current and previous checkpoint
6427          * records if the machine crashes just as we're writing the update.
6428          * (Perhaps it'd make even more sense to checkpoint only when the previous
6429          * checkpoint record is in a different xlog page?)
6430          *
6431          * We have to make two tests to determine that nothing has happened since
6432          * the start of the last checkpoint: current insertion point must match
6433          * the end of the last checkpoint record, and its redo pointer must point
6434          * to itself.
6435          */
6436         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
6437                                   CHECKPOINT_FORCE)) == 0)
6438         {
6439                 XLogRecPtr      curInsert;
6440
6441                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
6442                 if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
6443                         curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
6444                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
6445                         ControlFile->checkPoint.xlogid ==
6446                         ControlFile->checkPointCopy.redo.xlogid &&
6447                         ControlFile->checkPoint.xrecoff ==
6448                         ControlFile->checkPointCopy.redo.xrecoff)
6449                 {
6450                         LWLockRelease(WALInsertLock);
6451                         LWLockRelease(CheckpointLock);
6452                         END_CRIT_SECTION();
6453                         return;
6454                 }
6455         }
6456
6457         /*
6458          * An end-of-recovery checkpoint is created before anyone is allowed to
6459          * write WAL. To allow us to write the checkpoint record, temporarily
6460          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
6461          * initialized, which we need here and in AdvanceXLInsertBuffer.)
6462          */
6463         if (flags & CHECKPOINT_END_OF_RECOVERY)
6464                 LocalSetXLogInsertAllowed();
6465
6466         checkPoint.ThisTimeLineID = ThisTimeLineID;
6467
6468         /*
6469          * Compute new REDO record ptr = location of next XLOG record.
6470          *
6471          * NB: this is NOT necessarily where the checkpoint record itself will be,
6472          * since other backends may insert more XLOG records while we're off doing
6473          * the buffer flush work.  Those XLOG records are logically after the
6474          * checkpoint, even though physically before it.  Got that?
6475          */
6476         freespace = INSERT_FREESPACE(Insert);
6477         if (freespace < SizeOfXLogRecord)
6478         {
6479                 (void) AdvanceXLInsertBuffer(false);
6480                 /* OK to ignore update return flag, since we will do flush anyway */
6481                 freespace = INSERT_FREESPACE(Insert);
6482         }
6483         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
6484
6485         /*
6486          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
6487          * must be done while holding the insert lock AND the info_lck.
6488          *
6489          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6490          * pointing past where it really needs to point.  This is okay; the only
6491          * consequence is that XLogInsert might back up whole buffers that it
6492          * didn't really need to.  We can't postpone advancing RedoRecPtr because
6493          * XLogInserts that happen while we are dumping buffers must assume that
6494          * their buffer changes are not included in the checkpoint.
6495          */
6496         {
6497                 /* use volatile pointer to prevent code rearrangement */
6498                 volatile XLogCtlData *xlogctl = XLogCtl;
6499
6500                 SpinLockAcquire(&xlogctl->info_lck);
6501                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
6502                 SpinLockRelease(&xlogctl->info_lck);
6503         }
6504
6505         /*
6506          * Now we can release WAL insert lock, allowing other xacts to proceed
6507          * while we are flushing disk buffers.
6508          */
6509         LWLockRelease(WALInsertLock);
6510
6511         /*
6512          * If enabled, log checkpoint start.  We postpone this until now so as not
6513          * to log anything if we decided to skip the checkpoint.
6514          */
6515         if (log_checkpoints)
6516                 LogCheckpointStart(flags, false);
6517
6518         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
6519
6520         /*
6521          * Before flushing data, we must wait for any transactions that are
6522          * currently in their commit critical sections.  If an xact inserted its
6523          * commit record into XLOG just before the REDO point, then a crash
6524          * restart from the REDO point would not replay that record, which means
6525          * that our flushing had better include the xact's update of pg_clog.  So
6526          * we wait till he's out of his commit critical section before proceeding.
6527          * See notes in RecordTransactionCommit().
6528          *
6529          * Because we've already released WALInsertLock, this test is a bit fuzzy:
6530          * it is possible that we will wait for xacts we didn't really need to
6531          * wait for.  But the delay should be short and it seems better to make
6532          * checkpoint take a bit longer than to hold locks longer than necessary.
6533          * (In fact, the whole reason we have this issue is that xact.c does
6534          * commit record XLOG insertion and clog update as two separate steps
6535          * protected by different locks, but again that seems best on grounds of
6536          * minimizing lock contention.)
6537          *
6538          * A transaction that has not yet set inCommit when we look cannot be at
6539          * risk, since he's not inserted his commit record yet; and one that's
6540          * already cleared it is not at risk either, since he's done fixing clog
6541          * and we will correctly flush the update below.  So we cannot miss any
6542          * xacts we need to wait for.
6543          */
6544         nInCommit = GetTransactionsInCommit(&inCommitXids);
6545         if (nInCommit > 0)
6546         {
6547                 do
6548                 {
6549                         pg_usleep(10000L);      /* wait for 10 msec */
6550                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
6551         }
6552         pfree(inCommitXids);
6553
6554         /*
6555          * Get the other info we need for the checkpoint record.
6556          */
6557         LWLockAcquire(XidGenLock, LW_SHARED);
6558         checkPoint.nextXid = ShmemVariableCache->nextXid;
6559         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
6560         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
6561         LWLockRelease(XidGenLock);
6562
6563         /* Increase XID epoch if we've wrapped around since last checkpoint */
6564         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
6565         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
6566                 checkPoint.nextXidEpoch++;
6567
6568         LWLockAcquire(OidGenLock, LW_SHARED);
6569         checkPoint.nextOid = ShmemVariableCache->nextOid;
6570         if (!shutdown)
6571                 checkPoint.nextOid += ShmemVariableCache->oidCount;
6572         LWLockRelease(OidGenLock);
6573
6574         MultiXactGetCheckptMulti(shutdown,
6575                                                          &checkPoint.nextMulti,
6576                                                          &checkPoint.nextMultiOffset);
6577
6578         /*
6579          * Having constructed the checkpoint record, ensure all shmem disk buffers
6580          * and commit-log buffers are flushed to disk.
6581          *
6582          * This I/O could fail for various reasons.  If so, we will fail to
6583          * complete the checkpoint, but there is no reason to force a system
6584          * panic. Accordingly, exit critical section while doing it.
6585          */
6586         END_CRIT_SECTION();
6587
6588         CheckPointGuts(checkPoint.redo, flags);
6589
6590         START_CRIT_SECTION();
6591
6592         /*
6593          * Now insert the checkpoint record into XLOG.
6594          */
6595         rdata.data = (char *) (&checkPoint);
6596         rdata.len = sizeof(checkPoint);
6597         rdata.buffer = InvalidBuffer;
6598         rdata.next = NULL;
6599
6600         recptr = XLogInsert(RM_XLOG_ID,
6601                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
6602                                                 XLOG_CHECKPOINT_ONLINE,
6603                                                 &rdata);
6604
6605         XLogFlush(recptr);
6606
6607         /*
6608          * We mustn't write any new WAL after a shutdown checkpoint, or it will
6609          * be overwritten at next startup.  No-one should even try, this just
6610          * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
6611          * we want to just temporarily disable writing until the system has exited
6612          * recovery.
6613          */
6614         if (shutdown)
6615         {
6616                 if (flags & CHECKPOINT_END_OF_RECOVERY)
6617                         LocalXLogInsertAllowed = -1;    /* return to "check" state */
6618                 else
6619                         LocalXLogInsertAllowed = 0;             /* never again write WAL */
6620         }
6621
6622         /*
6623          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
6624          * = end of actual checkpoint record.
6625          */
6626         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
6627                 ereport(PANIC,
6628                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
6629
6630         /*
6631          * Select point at which we can truncate the log, which we base on the
6632          * prior checkpoint's earliest info.
6633          */
6634         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
6635
6636         /*
6637          * Update the control file.
6638          */
6639         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6640         if (shutdown)
6641                 ControlFile->state = DB_SHUTDOWNED;
6642         ControlFile->prevCheckPoint = ControlFile->checkPoint;
6643         ControlFile->checkPoint = ProcLastRecPtr;
6644         ControlFile->checkPointCopy = checkPoint;
6645         ControlFile->time = (pg_time_t) time(NULL);
6646         UpdateControlFile();
6647         LWLockRelease(ControlFileLock);
6648
6649         /* Update shared-memory copy of checkpoint XID/epoch */
6650         {
6651                 /* use volatile pointer to prevent code rearrangement */
6652                 volatile XLogCtlData *xlogctl = XLogCtl;
6653
6654                 SpinLockAcquire(&xlogctl->info_lck);
6655                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
6656                 xlogctl->ckptXid = checkPoint.nextXid;
6657                 SpinLockRelease(&xlogctl->info_lck);
6658         }
6659
6660         /*
6661          * We are now done with critical updates; no need for system panic if we
6662          * have trouble while fooling with old log segments.
6663          */
6664         END_CRIT_SECTION();
6665
6666         /*
6667          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
6668          */
6669         smgrpostckpt();
6670
6671         /*
6672          * Delete old log files (those no longer needed even for previous
6673          * checkpoint).
6674          */
6675         if (_logId || _logSeg)
6676         {
6677                 PrevLogSeg(_logId, _logSeg);
6678                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
6679         }
6680
6681         /*
6682          * Make more log segments if needed.  (Do this after recycling old log
6683          * segments, since that may supply some of the needed files.)
6684          */
6685         if (!shutdown)
6686                 PreallocXlogFiles(recptr);
6687
6688         /*
6689          * Truncate pg_subtrans if possible.  We can throw away all data before
6690          * the oldest XMIN of any running transaction.  No future transaction will
6691          * attempt to reference any pg_subtrans entry older than that (see Asserts
6692          * in subtrans.c).      During recovery, though, we mustn't do this because
6693          * StartupSUBTRANS hasn't been called yet.
6694          */
6695         if (!RecoveryInProgress())
6696                 TruncateSUBTRANS(GetOldestXmin(true, false));
6697
6698         /* All real work is done, but log before releasing lock. */
6699         if (log_checkpoints)
6700                 LogCheckpointEnd(false);
6701
6702         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
6703                                                                          NBuffers,
6704                                                                          CheckpointStats.ckpt_segs_added,
6705                                                                          CheckpointStats.ckpt_segs_removed,
6706                                                                          CheckpointStats.ckpt_segs_recycled);
6707
6708         LWLockRelease(CheckpointLock);
6709 }
6710
6711 /*
6712  * Flush all data in shared memory to disk, and fsync
6713  *
6714  * This is the common code shared between regular checkpoints and
6715  * recovery restartpoints.
6716  */
6717 static void
6718 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
6719 {
6720         CheckPointCLOG();
6721         CheckPointSUBTRANS();
6722         CheckPointMultiXact();
6723         CheckPointBuffers(flags);       /* performs all required fsyncs */
6724         /* We deliberately delay 2PC checkpointing as long as possible */
6725         CheckPointTwoPhase(checkPointRedo);
6726 }
6727
6728 /*
6729  * Save a checkpoint for recovery restart if appropriate
6730  *
6731  * This function is called each time a checkpoint record is read from XLOG.
6732  * It must determine whether the checkpoint represents a safe restartpoint or
6733  * not.  If so, the checkpoint record is stashed in shared memory so that
6734  * CreateRestartPoint can consult it.  (Note that the latter function is
6735  * executed by the bgwriter, while this one will be executed by the startup
6736  * process.)
6737  */
6738 static void
6739 RecoveryRestartPoint(const CheckPoint *checkPoint)
6740 {
6741         int                     rmid;
6742
6743         /* use volatile pointer to prevent code rearrangement */
6744         volatile XLogCtlData *xlogctl = XLogCtl;
6745
6746         /*
6747          * Is it safe to checkpoint?  We must ask each of the resource managers
6748          * whether they have any partial state information that might prevent a
6749          * correct restart from this point.  If so, we skip this opportunity, but
6750          * return at the next checkpoint record for another try.
6751          */
6752         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6753         {
6754                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
6755                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
6756                         {
6757                                 elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
6758                                          rmid,
6759                                          checkPoint->redo.xlogid,
6760                                          checkPoint->redo.xrecoff);
6761                                 return;
6762                         }
6763         }
6764
6765         /*
6766          * Copy the checkpoint record to shared memory, so that bgwriter can use
6767          * it the next time it wants to perform a restartpoint.
6768          */
6769         SpinLockAcquire(&xlogctl->info_lck);
6770         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
6771         memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
6772         SpinLockRelease(&xlogctl->info_lck);
6773 }
6774
6775 /*
6776  * Establish a restartpoint if possible.
6777  *
6778  * This is similar to CreateCheckPoint, but is used during WAL recovery
6779  * to establish a point from which recovery can roll forward without
6780  * replaying the entire recovery log.
6781  *
6782  * Returns true if a new restartpoint was established. We can only establish
6783  * a restartpoint if we have replayed a safe checkpoint record since last
6784  * restartpoint.
6785  */
6786 bool
6787 CreateRestartPoint(int flags)
6788 {
6789         XLogRecPtr      lastCheckPointRecPtr;
6790         CheckPoint      lastCheckPoint;
6791
6792         /* use volatile pointer to prevent code rearrangement */
6793         volatile XLogCtlData *xlogctl = XLogCtl;
6794
6795         /*
6796          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
6797          * happens at a time.
6798          */
6799         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6800
6801         /* Get a local copy of the last safe checkpoint record. */
6802         SpinLockAcquire(&xlogctl->info_lck);
6803         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
6804         memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
6805         SpinLockRelease(&xlogctl->info_lck);
6806
6807         /*
6808          * Check that we're still in recovery mode. It's ok if we exit recovery
6809          * mode after this check, the restart point is valid anyway.
6810          */
6811         if (!RecoveryInProgress())
6812         {
6813                 ereport(DEBUG2,
6814                           (errmsg("skipping restartpoint, recovery has already ended")));
6815                 LWLockRelease(CheckpointLock);
6816                 return false;
6817         }
6818
6819         /*
6820          * If the last checkpoint record we've replayed is already our last
6821          * restartpoint, we can't perform a new restart point. We still update
6822          * minRecoveryPoint in that case, so that if this is a shutdown restart
6823          * point, we won't start up earlier than before. That's not strictly
6824          * necessary, but when we get hot standby capability, it would be rather
6825          * weird if the database opened up for read-only connections at a
6826          * point-in-time before the last shutdown. Such time travel is still
6827          * possible in case of immediate shutdown, though.
6828          *
6829          * We don't explicitly advance minRecoveryPoint when we do create a
6830          * restartpoint. It's assumed that flushing the buffers will do that as a
6831          * side-effect.
6832          */
6833         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
6834                 XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
6835         {
6836                 XLogRecPtr      InvalidXLogRecPtr = {0, 0};
6837
6838                 ereport(DEBUG2,
6839                                 (errmsg("skipping restartpoint, already performed at %X/%X",
6840                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
6841
6842                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
6843                 LWLockRelease(CheckpointLock);
6844                 return false;
6845         }
6846
6847         if (log_checkpoints)
6848         {
6849                 /*
6850                  * Prepare to accumulate statistics.
6851                  */
6852                 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6853                 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6854
6855                 LogCheckpointStart(flags, true);
6856         }
6857
6858         CheckPointGuts(lastCheckPoint.redo, flags);
6859
6860         /*
6861          * Update pg_control, using current time.  Check that it still shows
6862          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
6863          * this is a quick hack to make sure nothing really bad happens if
6864          * somehow we get here after the end-of-recovery checkpoint.
6865          */
6866         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6867         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
6868                 XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
6869         {
6870                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6871                 ControlFile->checkPoint = lastCheckPointRecPtr;
6872                 ControlFile->checkPointCopy = lastCheckPoint;
6873                 ControlFile->time = (pg_time_t) time(NULL);
6874                 UpdateControlFile();
6875         }
6876         LWLockRelease(ControlFileLock);
6877
6878         /*
6879          * Currently, there is no need to truncate pg_subtrans during recovery. If
6880          * we did do that, we will need to have called StartupSUBTRANS() already
6881          * and then TruncateSUBTRANS() would go here.
6882          */
6883
6884         /* All real work is done, but log before releasing lock. */
6885         if (log_checkpoints)
6886                 LogCheckpointEnd(true);
6887
6888         ereport((log_checkpoints ? LOG : DEBUG2),
6889                         (errmsg("recovery restart point at %X/%X",
6890                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
6891
6892         /* XXX this is currently BROKEN because we are in the wrong process */
6893         if (recoveryLastXTime)
6894                 ereport((log_checkpoints ? LOG : DEBUG2),
6895                                 (errmsg("last completed transaction was at log time %s",
6896                                                 timestamptz_to_str(recoveryLastXTime))));
6897
6898         LWLockRelease(CheckpointLock);
6899         return true;
6900 }
6901
6902 /*
6903  * Write a NEXTOID log record
6904  */
6905 void
6906 XLogPutNextOid(Oid nextOid)
6907 {
6908         XLogRecData rdata;
6909
6910         rdata.data = (char *) (&nextOid);
6911         rdata.len = sizeof(Oid);
6912         rdata.buffer = InvalidBuffer;
6913         rdata.next = NULL;
6914         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
6915
6916         /*
6917          * We need not flush the NEXTOID record immediately, because any of the
6918          * just-allocated OIDs could only reach disk as part of a tuple insert or
6919          * update that would have its own XLOG record that must follow the NEXTOID
6920          * record.      Therefore, the standard buffer LSN interlock applied to those
6921          * records will ensure no such OID reaches disk before the NEXTOID record
6922          * does.
6923          *
6924          * Note, however, that the above statement only covers state "within" the
6925          * database.  When we use a generated OID as a file or directory name, we
6926          * are in a sense violating the basic WAL rule, because that filesystem
6927          * change may reach disk before the NEXTOID WAL record does.  The impact
6928          * of this is that if a database crash occurs immediately afterward, we
6929          * might after restart re-generate the same OID and find that it conflicts
6930          * with the leftover file or directory.  But since for safety's sake we
6931          * always loop until finding a nonconflicting filename, this poses no real
6932          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
6933          */
6934 }
6935
6936 /*
6937  * Write an XLOG SWITCH record.
6938  *
6939  * Here we just blindly issue an XLogInsert request for the record.
6940  * All the magic happens inside XLogInsert.
6941  *
6942  * The return value is either the end+1 address of the switch record,
6943  * or the end+1 address of the prior segment if we did not need to
6944  * write a switch record because we are already at segment start.
6945  */
6946 XLogRecPtr
6947 RequestXLogSwitch(void)
6948 {
6949         XLogRecPtr      RecPtr;
6950         XLogRecData rdata;
6951
6952         /* XLOG SWITCH, alone among xlog record types, has no data */
6953         rdata.buffer = InvalidBuffer;
6954         rdata.data = NULL;
6955         rdata.len = 0;
6956         rdata.next = NULL;
6957
6958         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
6959
6960         return RecPtr;
6961 }
6962
6963 /*
6964  * XLOG resource manager's routines
6965  *
6966  * Definitions of info values are in include/catalog/pg_control.h, though
6967  * not all record types are related to control file updates.
6968  */
6969 void
6970 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
6971 {
6972         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6973
6974         /* Backup blocks are not used in xlog records */
6975         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
6976
6977         if (info == XLOG_NEXTOID)
6978         {
6979                 Oid                     nextOid;
6980
6981                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
6982                 if (ShmemVariableCache->nextOid < nextOid)
6983                 {
6984                         ShmemVariableCache->nextOid = nextOid;
6985                         ShmemVariableCache->oidCount = 0;
6986                 }
6987         }
6988         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
6989         {
6990                 CheckPoint      checkPoint;
6991
6992                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6993                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
6994                 ShmemVariableCache->nextXid = checkPoint.nextXid;
6995                 ShmemVariableCache->nextOid = checkPoint.nextOid;
6996                 ShmemVariableCache->oidCount = 0;
6997                 MultiXactSetNextMXact(checkPoint.nextMulti,
6998                                                           checkPoint.nextMultiOffset);
6999                 ShmemVariableCache->oldestXid = checkPoint.oldestXid;
7000                 ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
7001
7002                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7003                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7004                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7005
7006                 /*
7007                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
7008                  */
7009                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7010                 {
7011                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
7012                                 !list_member_int(expectedTLIs,
7013                                                                  (int) checkPoint.ThisTimeLineID))
7014                                 ereport(PANIC,
7015                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7016                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
7017                         /* Following WAL records should be run with new TLI */
7018                         ThisTimeLineID = checkPoint.ThisTimeLineID;
7019                 }
7020
7021                 RecoveryRestartPoint(&checkPoint);
7022         }
7023         else if (info == XLOG_CHECKPOINT_ONLINE)
7024         {
7025                 CheckPoint      checkPoint;
7026
7027                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7028                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
7029                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
7030                                                                   checkPoint.nextXid))
7031                         ShmemVariableCache->nextXid = checkPoint.nextXid;
7032                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
7033                 {
7034                         ShmemVariableCache->nextOid = checkPoint.nextOid;
7035                         ShmemVariableCache->oidCount = 0;
7036                 }
7037                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
7038                                                                   checkPoint.nextMultiOffset);
7039                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
7040                                                                   checkPoint.oldestXid))
7041                 {
7042                         ShmemVariableCache->oldestXid = checkPoint.oldestXid;
7043                         ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
7044                 }
7045
7046                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7047                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7048                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7049
7050                 /* TLI should not change in an on-line checkpoint */
7051                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7052                         ereport(PANIC,
7053                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
7054                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
7055
7056                 RecoveryRestartPoint(&checkPoint);
7057         }
7058         else if (info == XLOG_NOOP)
7059         {
7060                 /* nothing to do here */
7061         }
7062         else if (info == XLOG_SWITCH)
7063         {
7064                 /* nothing to do here */
7065         }
7066 }
7067
7068 void
7069 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
7070 {
7071         uint8           info = xl_info & ~XLR_INFO_MASK;
7072
7073         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
7074                 info == XLOG_CHECKPOINT_ONLINE)
7075         {
7076                 CheckPoint *checkpoint = (CheckPoint *) rec;
7077
7078                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
7079                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; "
7080                                                  "oldest xid %u in DB %u; %s",
7081                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
7082                                                  checkpoint->ThisTimeLineID,
7083                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
7084                                                  checkpoint->nextOid,
7085                                                  checkpoint->nextMulti,
7086                                                  checkpoint->nextMultiOffset,
7087                                                  checkpoint->oldestXid,
7088                                                  checkpoint->oldestXidDB,
7089                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
7090         }
7091         else if (info == XLOG_NOOP)
7092         {
7093                 appendStringInfo(buf, "xlog no-op");
7094         }
7095         else if (info == XLOG_NEXTOID)
7096         {
7097                 Oid                     nextOid;
7098
7099                 memcpy(&nextOid, rec, sizeof(Oid));
7100                 appendStringInfo(buf, "nextOid: %u", nextOid);
7101         }
7102         else if (info == XLOG_SWITCH)
7103         {
7104                 appendStringInfo(buf, "xlog switch");
7105         }
7106         else
7107                 appendStringInfo(buf, "UNKNOWN");
7108 }
7109
#ifdef WAL_DEBUG

/*
 * xlog_outrec
 *		Append a short debugging summary of a WAL record header to buf.
 *
 * The summary shows the previous-record pointer, the xid, one "bkpbN" tag
 * for each backup-block bit set in xl_info, and the name of the resource
 * manager that owns the record.  Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blk = 0;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	/* Backup blocks are reported 1-based: bit 0 shows up as "bkpb1" */
	while (blk < XLR_MAX_BKP_BLOCKS)
	{
		if (record->xl_info & XLR_SET_BKP_BLOCK(blk))
			appendStringInfo(buf, "; bkpb%d", blk + 1);
		blk++;
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
7130
7131
7132 /*
7133  * Return the (possible) sync flag used for opening a file, depending on the
7134  * value of the GUC wal_sync_method.
7135  */
7136 static int
7137 get_sync_bit(int method)
7138 {
7139         /* If fsync is disabled, never open in sync mode */
7140         if (!enableFsync)
7141                 return 0;
7142
7143         switch (method)
7144         {
7145                         /*
7146                          * enum values for all sync options are defined even if they are
7147                          * not supported on the current platform.  But if not, they are
7148                          * not included in the enum option array, and therefore will never
7149                          * be seen here.
7150                          */
7151                 case SYNC_METHOD_FSYNC:
7152                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
7153                 case SYNC_METHOD_FDATASYNC:
7154                         return 0;
7155 #ifdef OPEN_SYNC_FLAG
7156                 case SYNC_METHOD_OPEN:
7157                         return OPEN_SYNC_FLAG;
7158 #endif
7159 #ifdef OPEN_DATASYNC_FLAG
7160                 case SYNC_METHOD_OPEN_DSYNC:
7161                         return OPEN_DATASYNC_FLAG;
7162 #endif
7163                 default:
7164                         /* can't happen (unless we are out of sync with option array) */
7165                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
7166                         return 0;                       /* silence warning */
7167         }
7168 }
7169
7170 /*
7171  * GUC support
7172  */
7173 bool
7174 assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
7175 {
7176         if (!doit)
7177                 return true;
7178
7179         if (sync_method != new_sync_method)
7180         {
7181                 /*
7182                  * To ensure that no blocks escape unsynced, force an fsync on the
7183                  * currently open log segment (if any).  Also, if the open flag is
7184                  * changing, close the log file so it will be reopened (with new flag
7185                  * bit) at next use.
7186                  */
7187                 if (openLogFile >= 0)
7188                 {
7189                         if (pg_fsync(openLogFile) != 0)
7190                                 ereport(PANIC,
7191                                                 (errcode_for_file_access(),
7192                                                  errmsg("could not fsync log file %u, segment %u: %m",
7193                                                                 openLogId, openLogSeg)));
7194                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
7195                                 XLogFileClose();
7196                 }
7197         }
7198
7199         return true;
7200 }
7201
7202
7203 /*
7204  * Issue appropriate kind of fsync (if any) on the current XLOG output file
7205  */
7206 static void
7207 issue_xlog_fsync(void)
7208 {
7209         switch (sync_method)
7210         {
7211                 case SYNC_METHOD_FSYNC:
7212                         if (pg_fsync_no_writethrough(openLogFile) != 0)
7213                                 ereport(PANIC,
7214                                                 (errcode_for_file_access(),
7215                                                  errmsg("could not fsync log file %u, segment %u: %m",
7216                                                                 openLogId, openLogSeg)));
7217                         break;
7218 #ifdef HAVE_FSYNC_WRITETHROUGH
7219                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
7220                         if (pg_fsync_writethrough(openLogFile) != 0)
7221                                 ereport(PANIC,
7222                                                 (errcode_for_file_access(),
7223                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
7224                                                                 openLogId, openLogSeg)));
7225                         break;
7226 #endif
7227 #ifdef HAVE_FDATASYNC
7228                 case SYNC_METHOD_FDATASYNC:
7229                         if (pg_fdatasync(openLogFile) != 0)
7230                                 ereport(PANIC,
7231                                                 (errcode_for_file_access(),
7232                                         errmsg("could not fdatasync log file %u, segment %u: %m",
7233                                                    openLogId, openLogSeg)));
7234                         break;
7235 #endif
7236                 case SYNC_METHOD_OPEN:
7237                 case SYNC_METHOD_OPEN_DSYNC:
7238                         /* write synced it already */
7239                         break;
7240                 default:
7241                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
7242                         break;
7243         }
7244 }
7245
7246
7247 /*
7248  * pg_start_backup: set up for taking an on-line backup dump
7249  *
7250  * Essentially what this does is to create a backup label file in $PGDATA,
7251  * where it will be archived as part of the backup dump.  The label file
7252  * contains the user-supplied label string (typically this would be used
7253  * to tell where the backup dump will be stored) and the starting time and
7254  * starting WAL location for the dump.
7255  */
7256 Datum
7257 pg_start_backup(PG_FUNCTION_ARGS)
7258 {
7259         text       *backupid = PG_GETARG_TEXT_P(0);
7260         bool            fast = PG_GETARG_BOOL(1);
7261         char       *backupidstr;
7262         XLogRecPtr      checkpointloc;
7263         XLogRecPtr      startpoint;
7264         pg_time_t       stamp_time;
7265         char            strfbuf[128];
7266         char            xlogfilename[MAXFNAMELEN];
7267         uint32          _logId;
7268         uint32          _logSeg;
7269         struct stat stat_buf;
7270         FILE       *fp;
7271
7272         if (!superuser())
7273                 ereport(ERROR,
7274                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7275                                  errmsg("must be superuser to run a backup")));
7276
7277         if (!XLogArchivingActive())
7278                 ereport(ERROR,
7279                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7280                                  errmsg("WAL archiving is not active"),
7281                                  errhint("archive_mode must be enabled at server start.")));
7282
7283         if (!XLogArchiveCommandSet())
7284                 ereport(ERROR,
7285                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7286                                  errmsg("WAL archiving is not active"),
7287                                  errhint("archive_command must be defined before "
7288                                                  "online backups can be made safely.")));
7289
7290         backupidstr = text_to_cstring(backupid);
7291
7292         /*
7293          * Mark backup active in shared memory.  We must do full-page WAL writes
7294          * during an on-line backup even if not doing so at other times, because
7295          * it's quite possible for the backup dump to obtain a "torn" (partially
7296          * written) copy of a database page if it reads the page concurrently with
7297          * our write to the same page.  This can be fixed as long as the first
7298          * write to the page in the WAL sequence is a full-page write. Hence, we
7299          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
7300          * are no dirty pages in shared memory that might get dumped while the
7301          * backup is in progress without having a corresponding WAL record.  (Once
7302          * the backup is complete, we need not force full-page writes anymore,
7303          * since we expect that any pages not modified during the backup interval
7304          * must have been correctly captured by the backup.)
7305          *
7306          * We must hold WALInsertLock to change the value of forcePageWrites, to
7307          * ensure adequate interlocking against XLogInsert().
7308          */
7309         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7310         if (XLogCtl->Insert.forcePageWrites)
7311         {
7312                 LWLockRelease(WALInsertLock);
7313                 ereport(ERROR,
7314                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7315                                  errmsg("a backup is already in progress"),
7316                                  errhint("Run pg_stop_backup() and try again.")));
7317         }
7318         XLogCtl->Insert.forcePageWrites = true;
7319         LWLockRelease(WALInsertLock);
7320
7321         /*
7322          * Force an XLOG file switch before the checkpoint, to ensure that the WAL
7323          * segment the checkpoint is written to doesn't contain pages with old
7324          * timeline IDs. That would otherwise happen if you called
7325          * pg_start_backup() right after restoring from a PITR archive: the first
7326          * WAL segment containing the startup checkpoint has pages in the
7327          * beginning with the old timeline ID. That can cause trouble at recovery:
7328          * we won't have a history file covering the old timeline if pg_xlog
7329          * directory was not included in the base backup and the WAL archive was
7330          * cleared too before starting the backup.
7331          */
7332         RequestXLogSwitch();
7333
7334         /* Ensure we release forcePageWrites if fail below */
7335         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
7336         {
7337                 /*
7338                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
7339                  * page problems, this guarantees that two successive backup runs will
7340                  * have different checkpoint positions and hence different history
7341                  * file names, even if nothing happened in between.
7342                  *
7343                  * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
7344                  * fast = true).  Otherwise this can take awhile.
7345                  */
7346                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
7347                                                   (fast ? CHECKPOINT_IMMEDIATE : 0));
7348
7349                 /*
7350                  * Now we need to fetch the checkpoint record location, and also its
7351                  * REDO pointer.  The oldest point in WAL that would be needed to
7352                  * restore starting from the checkpoint is precisely the REDO pointer.
7353                  */
7354                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7355                 checkpointloc = ControlFile->checkPoint;
7356                 startpoint = ControlFile->checkPointCopy.redo;
7357                 LWLockRelease(ControlFileLock);
7358
7359                 XLByteToSeg(startpoint, _logId, _logSeg);
7360                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
7361
7362                 /* Use the log timezone here, not the session timezone */
7363                 stamp_time = (pg_time_t) time(NULL);
7364                 pg_strftime(strfbuf, sizeof(strfbuf),
7365                                         "%Y-%m-%d %H:%M:%S %Z",
7366                                         pg_localtime(&stamp_time, log_timezone));
7367
7368                 /*
7369                  * Check for existing backup label --- implies a backup is already
7370                  * running.  (XXX given that we checked forcePageWrites above, maybe
7371                  * it would be OK to just unlink any such label file?)
7372                  */
7373                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
7374                 {
7375                         if (errno != ENOENT)
7376                                 ereport(ERROR,
7377                                                 (errcode_for_file_access(),
7378                                                  errmsg("could not stat file \"%s\": %m",
7379                                                                 BACKUP_LABEL_FILE)));
7380                 }
7381                 else
7382                         ereport(ERROR,
7383                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7384                                          errmsg("a backup is already in progress"),
7385                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
7386                                                          BACKUP_LABEL_FILE)));
7387
7388                 /*
7389                  * Okay, write the file
7390                  */
7391                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
7392                 if (!fp)
7393                         ereport(ERROR,
7394                                         (errcode_for_file_access(),
7395                                          errmsg("could not create file \"%s\": %m",
7396                                                         BACKUP_LABEL_FILE)));
7397                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
7398                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
7399                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
7400                                 checkpointloc.xlogid, checkpointloc.xrecoff);
7401                 fprintf(fp, "START TIME: %s\n", strfbuf);
7402                 fprintf(fp, "LABEL: %s\n", backupidstr);
7403                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
7404                         ereport(ERROR,
7405                                         (errcode_for_file_access(),
7406                                          errmsg("could not write file \"%s\": %m",
7407                                                         BACKUP_LABEL_FILE)));
7408         }
7409         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
7410
7411         /*
7412          * We're done.  As a convenience, return the starting WAL location.
7413          */
7414         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
7415                          startpoint.xlogid, startpoint.xrecoff);
7416         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7417 }
7418
7419 /* Error cleanup callback for pg_start_backup */
7420 static void
7421 pg_start_backup_callback(int code, Datum arg)
7422 {
7423         /* Turn off forcePageWrites on failure */
7424         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7425         XLogCtl->Insert.forcePageWrites = false;
7426         LWLockRelease(WALInsertLock);
7427 }
7428
7429 /*
7430  * pg_stop_backup: finish taking an on-line backup dump
7431  *
7432  * We remove the backup label file created by pg_start_backup, and instead
7433  * create a backup history file in pg_xlog (whence it will immediately be
7434  * archived).  The backup history file contains the same info found in
7435  * the label file, plus the backup-end time and WAL location.
7436  * Note: different from CancelBackup which just cancels online backup mode.
7437  */
7438 Datum
7439 pg_stop_backup(PG_FUNCTION_ARGS)
7440 {
7441         XLogRecPtr      startpoint;
7442         XLogRecPtr      stoppoint;
7443         pg_time_t       stamp_time;
7444         char            strfbuf[128];
7445         char            histfilepath[MAXPGPATH];
7446         char            startxlogfilename[MAXFNAMELEN];
7447         char            stopxlogfilename[MAXFNAMELEN];
7448         char            lastxlogfilename[MAXFNAMELEN];
7449         char            histfilename[MAXFNAMELEN];
7450         uint32          _logId;
7451         uint32          _logSeg;
7452         FILE       *lfp;
7453         FILE       *fp;
7454         char            ch;
7455         int                     ich;
7456         int                     seconds_before_warning;
7457         int                     waits = 0;
7458
7459         if (!superuser())
7460                 ereport(ERROR,
7461                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7462                                  (errmsg("must be superuser to run a backup"))));
7463
7464         if (!XLogArchivingActive())
7465                 ereport(ERROR,
7466                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7467                                  errmsg("WAL archiving is not active"),
7468                                  errhint("archive_mode must be enabled at server start.")));
7469
7470         /*
7471          * OK to clear forcePageWrites
7472          */
7473         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7474         XLogCtl->Insert.forcePageWrites = false;
7475         LWLockRelease(WALInsertLock);
7476
7477         /*
7478          * Force a switch to a new xlog segment file, so that the backup is valid
7479          * as soon as archiver moves out the current segment file. We'll report
7480          * the end address of the XLOG SWITCH record as the backup stopping point.
7481          */
7482         stoppoint = RequestXLogSwitch();
7483
7484         XLByteToSeg(stoppoint, _logId, _logSeg);
7485         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
7486
7487         /* Use the log timezone here, not the session timezone */
7488         stamp_time = (pg_time_t) time(NULL);
7489         pg_strftime(strfbuf, sizeof(strfbuf),
7490                                 "%Y-%m-%d %H:%M:%S %Z",
7491                                 pg_localtime(&stamp_time, log_timezone));
7492
7493         /*
7494          * Open the existing label file
7495          */
7496         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7497         if (!lfp)
7498         {
7499                 if (errno != ENOENT)
7500                         ereport(ERROR,
7501                                         (errcode_for_file_access(),
7502                                          errmsg("could not read file \"%s\": %m",
7503                                                         BACKUP_LABEL_FILE)));
7504                 ereport(ERROR,
7505                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7506                                  errmsg("a backup is not in progress")));
7507         }
7508
7509         /*
7510          * Read and parse the START WAL LOCATION line (this code is pretty crude,
7511          * but we are not expecting any variability in the file format).
7512          */
7513         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
7514                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7515                            &ch) != 4 || ch != '\n')
7516                 ereport(ERROR,
7517                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7518                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7519
7520         /*
7521          * Write the backup history file
7522          */
7523         XLByteToSeg(startpoint, _logId, _logSeg);
7524         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
7525                                                   startpoint.xrecoff % XLogSegSize);
7526         fp = AllocateFile(histfilepath, "w");
7527         if (!fp)
7528                 ereport(ERROR,
7529                                 (errcode_for_file_access(),
7530                                  errmsg("could not create file \"%s\": %m",
7531                                                 histfilepath)));
7532         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
7533                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
7534         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
7535                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
7536         /* transfer remaining lines from label to history file */
7537         while ((ich = fgetc(lfp)) != EOF)
7538                 fputc(ich, fp);
7539         fprintf(fp, "STOP TIME: %s\n", strfbuf);
7540         if (fflush(fp) || ferror(fp) || FreeFile(fp))
7541                 ereport(ERROR,
7542                                 (errcode_for_file_access(),
7543                                  errmsg("could not write file \"%s\": %m",
7544                                                 histfilepath)));
7545
7546         /*
7547          * Close and remove the backup label file
7548          */
7549         if (ferror(lfp) || FreeFile(lfp))
7550                 ereport(ERROR,
7551                                 (errcode_for_file_access(),
7552                                  errmsg("could not read file \"%s\": %m",
7553                                                 BACKUP_LABEL_FILE)));
7554         if (unlink(BACKUP_LABEL_FILE) != 0)
7555                 ereport(ERROR,
7556                                 (errcode_for_file_access(),
7557                                  errmsg("could not remove file \"%s\": %m",
7558                                                 BACKUP_LABEL_FILE)));
7559
7560         /*
7561          * Clean out any no-longer-needed history files.  As a side effect, this
7562          * will post a .ready file for the newly created history file, notifying
7563          * the archiver that history file may be archived immediately.
7564          */
7565         CleanupBackupHistory();
7566
7567         /*
7568          * Wait until both the last WAL file filled during backup and the history
7569          * file have been archived.  We assume that the alphabetic sorting
7570          * property of the WAL files ensures any earlier WAL files are safely
7571          * archived as well.
7572          *
7573          * We wait forever, since archive_command is supposed to work and we
7574          * assume the admin wanted his backup to work completely. If you don't
7575          * wish to wait, you can set statement_timeout.
7576          */
7577         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
7578         XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
7579
7580         XLByteToSeg(startpoint, _logId, _logSeg);
7581         BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
7582                                                   startpoint.xrecoff % XLogSegSize);
7583
7584         seconds_before_warning = 60;
7585         waits = 0;
7586
7587         while (XLogArchiveIsBusy(lastxlogfilename) ||
7588                    XLogArchiveIsBusy(histfilename))
7589         {
7590                 CHECK_FOR_INTERRUPTS();
7591
7592                 pg_usleep(1000000L);
7593
7594                 if (++waits >= seconds_before_warning)
7595                 {
7596                         seconds_before_warning *= 2;            /* This wraps in >10 years... */
7597                         ereport(WARNING,
7598                                         (errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
7599                                                         waits)));
7600                 }
7601         }
7602
7603         /*
7604          * We're done.  As a convenience, return the ending WAL location.
7605          */
7606         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
7607                          stoppoint.xlogid, stoppoint.xrecoff);
7608         PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
7609 }
7610
7611 /*
7612  * pg_switch_xlog: switch to next xlog file
7613  */
7614 Datum
7615 pg_switch_xlog(PG_FUNCTION_ARGS)
7616 {
7617         XLogRecPtr      switchpoint;
7618         char            location[MAXFNAMELEN];
7619
7620         if (!superuser())
7621                 ereport(ERROR,
7622                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7623                          (errmsg("must be superuser to switch transaction log files"))));
7624
7625         switchpoint = RequestXLogSwitch();
7626
7627         /*
7628          * As a convenience, return the WAL location of the switch record
7629          */
7630         snprintf(location, sizeof(location), "%X/%X",
7631                          switchpoint.xlogid, switchpoint.xrecoff);
7632         PG_RETURN_TEXT_P(cstring_to_text(location));
7633 }
7634
7635 /*
7636  * Report the current WAL write location (same format as pg_start_backup etc)
7637  *
7638  * This is useful for determining how much of WAL is visible to an external
7639  * archiving process.  Note that the data before this point is written out
7640  * to the kernel, but is not necessarily synced to disk.
7641  */
7642 Datum
7643 pg_current_xlog_location(PG_FUNCTION_ARGS)
7644 {
7645         char            location[MAXFNAMELEN];
7646
7647         /* Make sure we have an up-to-date local LogwrtResult */
7648         {
7649                 /* use volatile pointer to prevent code rearrangement */
7650                 volatile XLogCtlData *xlogctl = XLogCtl;
7651
7652                 SpinLockAcquire(&xlogctl->info_lck);
7653                 LogwrtResult = xlogctl->LogwrtResult;
7654                 SpinLockRelease(&xlogctl->info_lck);
7655         }
7656
7657         snprintf(location, sizeof(location), "%X/%X",
7658                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
7659         PG_RETURN_TEXT_P(cstring_to_text(location));
7660 }
7661
7662 /*
7663  * Report the current WAL insert location (same format as pg_start_backup etc)
7664  *
7665  * This function is mostly for debugging purposes.
7666  */
7667 Datum
7668 pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
7669 {
7670         XLogCtlInsert *Insert = &XLogCtl->Insert;
7671         XLogRecPtr      current_recptr;
7672         char            location[MAXFNAMELEN];
7673
7674         /*
7675          * Get the current end-of-WAL position ... shared lock is sufficient
7676          */
7677         LWLockAcquire(WALInsertLock, LW_SHARED);
7678         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
7679         LWLockRelease(WALInsertLock);
7680
7681         snprintf(location, sizeof(location), "%X/%X",
7682                          current_recptr.xlogid, current_recptr.xrecoff);
7683         PG_RETURN_TEXT_P(cstring_to_text(location));
7684 }
7685
7686 /*
7687  * Compute an xlog file name and decimal byte offset given a WAL location,
7688  * such as is returned by pg_stop_backup() or pg_xlog_switch().
7689  *
7690  * Note that a location exactly at a segment boundary is taken to be in
7691  * the previous segment.  This is usually the right thing, since the
7692  * expected usage is to determine which xlog file(s) are ready to archive.
7693  */
7694 Datum
7695 pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
7696 {
7697         text       *location = PG_GETARG_TEXT_P(0);
7698         char       *locationstr;
7699         unsigned int uxlogid;
7700         unsigned int uxrecoff;
7701         uint32          xlogid;
7702         uint32          xlogseg;
7703         uint32          xrecoff;
7704         XLogRecPtr      locationpoint;
7705         char            xlogfilename[MAXFNAMELEN];
7706         Datum           values[2];
7707         bool            isnull[2];
7708         TupleDesc       resultTupleDesc;
7709         HeapTuple       resultHeapTuple;
7710         Datum           result;
7711
7712         /*
7713          * Read input and parse
7714          */
7715         locationstr = text_to_cstring(location);
7716
7717         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
7718                 ereport(ERROR,
7719                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
7720                                  errmsg("could not parse transaction log location \"%s\"",
7721                                                 locationstr)));
7722
7723         locationpoint.xlogid = uxlogid;
7724         locationpoint.xrecoff = uxrecoff;
7725
7726         /*
7727          * Construct a tuple descriptor for the result row.  This must match this
7728          * function's pg_proc entry!
7729          */
7730         resultTupleDesc = CreateTemplateTupleDesc(2, false);
7731         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
7732                                            TEXTOID, -1, 0);
7733         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
7734                                            INT4OID, -1, 0);
7735
7736         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
7737
7738         /*
7739          * xlogfilename
7740          */
7741         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
7742         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
7743
7744         values[0] = CStringGetTextDatum(xlogfilename);
7745         isnull[0] = false;
7746
7747         /*
7748          * offset
7749          */
7750         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
7751
7752         values[1] = UInt32GetDatum(xrecoff);
7753         isnull[1] = false;
7754
7755         /*
7756          * Tuple jam: Having first prepared your Datums, then squash together
7757          */
7758         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
7759
7760         result = HeapTupleGetDatum(resultHeapTuple);
7761
7762         PG_RETURN_DATUM(result);
7763 }
7764
7765 /*
7766  * Compute an xlog file name given a WAL location,
7767  * such as is returned by pg_stop_backup() or pg_xlog_switch().
7768  */
7769 Datum
7770 pg_xlogfile_name(PG_FUNCTION_ARGS)
7771 {
7772         text       *location = PG_GETARG_TEXT_P(0);
7773         char       *locationstr;
7774         unsigned int uxlogid;
7775         unsigned int uxrecoff;
7776         uint32          xlogid;
7777         uint32          xlogseg;
7778         XLogRecPtr      locationpoint;
7779         char            xlogfilename[MAXFNAMELEN];
7780
7781         locationstr = text_to_cstring(location);
7782
7783         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
7784                 ereport(ERROR,
7785                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
7786                                  errmsg("could not parse transaction log location \"%s\"",
7787                                                 locationstr)));
7788
7789         locationpoint.xlogid = uxlogid;
7790         locationpoint.xrecoff = uxrecoff;
7791
7792         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
7793         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
7794
7795         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7796 }
7797
7798 /*
7799  * read_backup_label: check to see if a backup_label file is present
7800  *
7801  * If we see a backup_label during recovery, we assume that we are recovering
7802  * from a backup dump file, and we therefore roll forward from the checkpoint
7803  * identified by the label file, NOT what pg_control says.      This avoids the
7804  * problem that pg_control might have been archived one or more checkpoints
7805  * later than the start of the dump, and so if we rely on it as the start
7806  * point, we will fail to restore a consistent database state.
7807  *
7808  * We also attempt to retrieve the corresponding backup history file.
7809  * If successful, set *minRecoveryLoc to constrain valid PITR stopping
7810  * points.
7811  *
7812  * Returns TRUE if a backup_label was found (and fills the checkpoint
7813  * location into *checkPointLoc); returns FALSE if not.
7814  */
7815 static bool
7816 read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc)
7817 {
7818         XLogRecPtr      startpoint;
7819         XLogRecPtr      stoppoint;
7820         char            histfilename[MAXFNAMELEN];
7821         char            histfilepath[MAXPGPATH];
7822         char            startxlogfilename[MAXFNAMELEN];
7823         char            stopxlogfilename[MAXFNAMELEN];
7824         TimeLineID      tli;
7825         uint32          _logId;
7826         uint32          _logSeg;
7827         FILE       *lfp;
7828         FILE       *fp;
7829         char            ch;
7830
7831         /* Default is to not constrain recovery stop point */
7832         minRecoveryLoc->xlogid = 0;
7833         minRecoveryLoc->xrecoff = 0;
7834
7835         /*
7836          * See if label file is present
7837          */
7838         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7839         if (!lfp)
7840         {
7841                 if (errno != ENOENT)
7842                         ereport(FATAL,
7843                                         (errcode_for_file_access(),
7844                                          errmsg("could not read file \"%s\": %m",
7845                                                         BACKUP_LABEL_FILE)));
7846                 return false;                   /* it's not there, all is fine */
7847         }
7848
7849         /*
7850          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
7851          * is pretty crude, but we are not expecting any variability in the file
7852          * format).
7853          */
7854         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
7855                            &startpoint.xlogid, &startpoint.xrecoff, &tli,
7856                            startxlogfilename, &ch) != 5 || ch != '\n')
7857                 ereport(FATAL,
7858                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7859                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7860         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
7861                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
7862                            &ch) != 3 || ch != '\n')
7863                 ereport(FATAL,
7864                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7865                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7866         if (ferror(lfp) || FreeFile(lfp))
7867                 ereport(FATAL,
7868                                 (errcode_for_file_access(),
7869                                  errmsg("could not read file \"%s\": %m",
7870                                                 BACKUP_LABEL_FILE)));
7871
7872         /*
7873          * Try to retrieve the backup history file (no error if we can't)
7874          */
7875         XLByteToSeg(startpoint, _logId, _logSeg);
7876         BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
7877                                                   startpoint.xrecoff % XLogSegSize);
7878
7879         if (InArchiveRecovery)
7880                 RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
7881         else
7882                 BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
7883                                                           startpoint.xrecoff % XLogSegSize);
7884
7885         fp = AllocateFile(histfilepath, "r");
7886         if (fp)
7887         {
7888                 /*
7889                  * Parse history file to identify stop point.
7890                  */
7891                 if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
7892                                    &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7893                                    &ch) != 4 || ch != '\n')
7894                         ereport(FATAL,
7895                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7896                                          errmsg("invalid data in file \"%s\"", histfilename)));
7897                 if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
7898                                    &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
7899                                    &ch) != 4 || ch != '\n')
7900                         ereport(FATAL,
7901                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7902                                          errmsg("invalid data in file \"%s\"", histfilename)));
7903                 *minRecoveryLoc = stoppoint;
7904                 if (ferror(fp) || FreeFile(fp))
7905                         ereport(FATAL,
7906                                         (errcode_for_file_access(),
7907                                          errmsg("could not read file \"%s\": %m",
7908                                                         histfilepath)));
7909         }
7910
7911         return true;
7912 }
7913
7914 /*
7915  * Error context callback for errors occurring during rm_redo().
7916  */
7917 static void
7918 rm_redo_error_callback(void *arg)
7919 {
7920         XLogRecord *record = (XLogRecord *) arg;
7921         StringInfoData buf;
7922
7923         initStringInfo(&buf);
7924         RmgrTable[record->xl_rmid].rm_desc(&buf,
7925                                                                            record->xl_info,
7926                                                                            XLogRecGetData(record));
7927
7928         /* don't bother emitting empty description */
7929         if (buf.len > 0)
7930                 errcontext("xlog redo %s", buf.data);
7931
7932         pfree(buf.data);
7933 }
7934
7935 /*
7936  * BackupInProgress: check if online backup mode is active
7937  *
7938  * This is done by checking for existence of the "backup_label" file.
7939  */
7940 bool
7941 BackupInProgress(void)
7942 {
7943         struct stat stat_buf;
7944
7945         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
7946 }
7947
7948 /*
7949  * CancelBackup: rename the "backup_label" file to cancel backup mode
7950  *
7951  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
7952  * Note that this will render an online backup in progress useless.
7953  * To correctly finish an online backup, pg_stop_backup must be called.
7954  */
7955 void
7956 CancelBackup(void)
7957 {
7958         struct stat stat_buf;
7959
7960         /* if the file is not there, return */
7961         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
7962                 return;
7963
7964         /* remove leftover file from previously cancelled backup if it exists */
7965         unlink(BACKUP_LABEL_OLD);
7966
7967         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
7968         {
7969                 ereport(LOG,
7970                                 (errmsg("online backup mode cancelled"),
7971                                  errdetail("\"%s\" was renamed to \"%s\".",
7972                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7973         }
7974         else
7975         {
7976                 ereport(WARNING,
7977                                 (errcode_for_file_access(),
7978                                  errmsg("online backup mode was not cancelled"),
7979                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
7980                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7981         }
7982 }
7983
7984 /* ------------------------------------------------------
7985  *      Startup Process main entry point and signal handlers
7986  * ------------------------------------------------------
7987  */
7988
7989 /*
7990  * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
7991  *
7992  * Some backend has bought the farm,
7993  * so we need to stop what we're doing and exit.
7994  */
7995 static void
7996 startupproc_quickdie(SIGNAL_ARGS)
7997 {
7998         PG_SETMASK(&BlockSig);
7999
8000         /*
8001          * We DO NOT want to run proc_exit() callbacks -- we're here because
8002          * shared memory may be corrupted, so we don't want to try to clean up our
8003          * transaction.  Just nail the windows shut and get out of town.  Now that
8004          * there's an atexit callback to prevent third-party code from breaking
8005          * things by calling exit() directly, we have to reset the callbacks
8006          * explicitly to make this work as intended.
8007          */
8008         on_exit_reset();
8009
8010         /*
8011          * Note we do exit(2) not exit(0).      This is to force the postmaster into a
8012          * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
8013          * backend.  This is necessary precisely because we don't clean up our
8014          * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
8015          * should ensure the postmaster sees this as a crash, too, but no harm in
8016          * being doubly sure.)
8017          */
8018         exit(2);
8019 }
8020
8021
8022 /* SIGHUP: set flag to re-read config file at next convenient time */
8023 static void
8024 StartupProcSigHupHandler(SIGNAL_ARGS)
8025 {
8026         got_SIGHUP = true;
8027 }
8028
8029 /* SIGTERM: set flag to abort redo and exit */
8030 static void
8031 StartupProcShutdownHandler(SIGNAL_ARGS)
8032 {
8033         if (in_restore_command)
8034                 proc_exit(1);
8035         else
8036                 shutdown_requested = true;
8037 }
8038
8039 /* Main entry point for startup process */
8040 void
8041 StartupProcessMain(void)
8042 {
8043         /*
8044          * If possible, make this process a group leader, so that the postmaster
8045          * can signal any child processes too.
8046          */
8047 #ifdef HAVE_SETSID
8048         if (setsid() < 0)
8049                 elog(FATAL, "setsid() failed: %m");
8050 #endif
8051
8052         /*
8053          * Properly accept or ignore signals the postmaster might send us
8054          */
8055         pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */
8056         pqsignal(SIGINT, SIG_IGN);      /* ignore query cancel */
8057         pqsignal(SIGTERM, StartupProcShutdownHandler);          /* request shutdown */
8058         pqsignal(SIGQUIT, startupproc_quickdie);        /* hard crash time */
8059         pqsignal(SIGALRM, SIG_IGN);
8060         pqsignal(SIGPIPE, SIG_IGN);
8061         pqsignal(SIGUSR1, SIG_IGN);
8062         pqsignal(SIGUSR2, SIG_IGN);
8063
8064         /*
8065          * Reset some signals that are accepted by postmaster but not here
8066          */
8067         pqsignal(SIGCHLD, SIG_DFL);
8068         pqsignal(SIGTTIN, SIG_DFL);
8069         pqsignal(SIGTTOU, SIG_DFL);
8070         pqsignal(SIGCONT, SIG_DFL);
8071         pqsignal(SIGWINCH, SIG_DFL);
8072
8073         /*
8074          * Unblock signals (they were blocked when the postmaster forked us)
8075          */
8076         PG_SETMASK(&UnBlockSig);
8077
8078         StartupXLOG();
8079
8080         BuildFlatFiles(false);
8081
8082         /*
8083          * Exit normally. Exit code 0 tells postmaster that we completed recovery
8084          * successfully.
8085          */
8086         proc_exit(0);
8087 }