* PostgreSQL transaction log manager
*
*
- * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/transam/xlog.c
#include "access/clog.h"
#include "access/multixact.h"
+#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/timeline.h"
#include "access/transam.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
-#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
+#include "replication/logical.h"
+#include "replication/slot.h"
+#include "replication/snapbuild.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
+#include "storage/barrier.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+#include "storage/large_object.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/predicate.h"
#include "utils/timestamp.h"
#include "pg_trace.h"
+extern uint32 bootstrap_data_checksum_version;
/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
-#define PROMOTE_SIGNAL_FILE "promote"
-#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
+#define PROMOTE_SIGNAL_FILE "promote"
+#define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
/* User-settable parameters */
char *XLogArchiveCommand = NULL;
bool EnableHotStandby = false;
bool fullPageWrites = true;
+bool wal_log_hints = false;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
+int num_xloginsert_locks = 8;
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
* future XLOG segment as long as there aren't already XLOGfileslop future
* segments; else we'll delete it. This could be made a separate GUC
* variable, but at present I think it's sufficient to hardwire it as
- * 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free
+ * 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free
* no more than 2*CheckPointSegments log segments, and we want to recycle all
* of them; the +1 allows boundary cases to happen without wasting a
* delete/create-segment cycle.
/*
* Statistics for current checkpoint are collected in this global struct.
- * Because only the background writer or a stand-alone backend can perform
+ * Because only the checkpointer or a stand-alone backend can perform
* checkpoints, this will be unused in normal backends.
*/
CheckpointStatsData CheckpointStats;
* 0: unconditionally not allowed to insert XLOG
* -1: must check RecoveryInProgress(); disallow until it is false
* Most processes start with -1 and transition to 1 after seeing that recovery
- * is not in progress. But we can also force the value for special cases.
+ * is not in progress. But we can also force the value for special cases.
* The coding in XLogInsertAllowed() depends on the first two of these states
* being numerically the same as bool true and false.
*/
* will switch to using offline XLOG archives as soon as we reach the end of
* WAL in pg_xlog.
*/
-bool ArchiveRecoveryRequested = false;
-bool InArchiveRecovery = false;
+bool ArchiveRecoveryRequested = false;
+bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
/* options taken from recovery.conf for archive recovery */
-char *recoveryRestoreCommand = NULL;
+char *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
static char *archiveCleanupCommand = NULL;
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static char *recoveryTargetName;
+static int recovery_min_apply_delay = 0;
+static TimestampTz recoveryDelayUntilTime;
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyModeRequested = false;
static char *PrimaryConnInfo = NULL;
+static char *PrimarySlotName = NULL;
static char *TriggerFile = NULL;
/* are we currently in standby mode? */
-bool StandbyMode = false;
+bool StandbyMode = false;
/* whether request for fast promotion has been made yet */
static bool fast_promote = false;
-/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
+/*
+ * if recoveryStopsBefore/After returns true, it saves information of the stop
+ * point here
+ */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static char recoveryStopName[MAXFNAMELEN];
*
* expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
* its known parents, newest first (so recoveryTargetTLI is always the
- * first list member). Only these TLIs are expected to be seen in the WAL
+ * first list member). Only these TLIs are expected to be seen in the WAL
* segments we read, and indeed only these TLIs will be considered as
* candidate WAL files to open at all.
*
/*
* RedoRecPtr is this backend's local copy of the REDO record pointer
* (which is almost but not quite the same as a pointer to the most recent
- * CHECKPOINT record). We update this from the shared-memory copy,
+ * CHECKPOINT record). We update this from the shared-memory copy,
* XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
- * hold the Insert lock). See XLogInsert for details. We are also allowed
- * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
+ * hold an insertion lock). See XLogInsert for details. We are also allowed
+ * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
* see GetRedoRecPtr. A freshly spawned backend obtains the value during
* InitXLOGAccess.
*/
* so it's a plain spinlock. The other locks are held longer (potentially
* over I/O operations), so we use LWLocks for them. These locks are:
*
- * WALInsertLock: must be held to insert a record into the WAL buffers.
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
+ * It is only held while initializing and changing the mapping. If the
+ * contents of the buffer being replaced haven't been written yet, the mapping
+ * lock is released while the write is done, and reacquired afterwards.
*
* WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
* XLogFlush).
XLogRecPtr Flush; /* last byte + 1 flushed */
} XLogwrtResult;
+/*
+ * Inserting to WAL is protected by a small fixed number of WAL insertion
+ * locks. To insert to the WAL, you must hold one of the locks - it doesn't
+ * matter which one. To lock out other concurrent insertions, you must hold
+ * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
+ * indicator of how far the insertion has progressed (insertingAt).
+ *
+ * The insertingAt values are read when a process wants to flush WAL from
+ * the in-memory buffers to disk, to check that all the insertions to the
+ * region the process is about to write out have finished. You could simply
+ * wait for all currently in-progress insertions to finish, but the
+ * insertingAt indicator allows you to ignore insertions to later in the WAL,
+ * so that you only wait for the insertions that are modifying the buffers
+ * you're about to write out.
+ *
+ * This isn't just an optimization. If all the WAL buffers are dirty, an
+ * inserter that's holding a WAL insert lock might need to evict an old WAL
+ * buffer, which requires flushing the WAL. If it's possible for an inserter
+ * to block on another inserter unnecessarily, deadlock can arise when two
+ * inserters holding a WAL insert lock wait for each other to finish their
+ * insertion.
+ *
+ * Small WAL records that don't cross a page boundary never update the value;
+ * the WAL record is just copied to the page and the lock is released. But
+ * to avoid the deadlock-scenario explained above, the indicator is always
+ * updated before sleeping while holding an insertion lock.
+ */
+typedef struct
+{
+ LWLock lock; /* lightweight lock held while an insertion is in progress */
+ XLogRecPtr insertingAt; /* how far the holder's insertion has progressed */
+} WALInsertLock;
+
+/*
+ * All the WAL insertion locks are allocated as an array in shared memory. We
+ * force the array stride to be a power of 2, which saves a few cycles in
+ * indexing, but more importantly also ensures that individual slots don't
+ * cross cache line boundaries. (Of course, we have to also ensure that the
+ * array start address is suitably aligned.)
+ */
+typedef union WALInsertLockPadded
+{
+ WALInsertLock l; /* the actual insertion lock */
+ char pad[CACHE_LINE_SIZE]; /* makes each array slot at least CACHE_LINE_SIZE bytes */
+} WALInsertLockPadded;
+
/*
* Shared state data for XLogInsert.
*/
typedef struct XLogCtlInsert
{
- XLogRecPtr PrevRecord; /* start of previously-inserted record */
- int curridx; /* current block index in cache */
- XLogPageHeader currpage; /* points to header of block in cache */
- char *currpos; /* current insertion point in cache */
- XLogRecPtr RedoRecPtr; /* current redo point for insertions */
- bool forcePageWrites; /* forcing full-page writes for PITR? */
+ slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
+
+ /*
+ * CurrBytePos is the end of reserved WAL. The next record will be
+ * inserted at that position. PrevBytePos is the start position of the
+ * previously inserted (or rather, reserved) record - it is copied to the
+ * prev-link of the next record. These are stored as "usable byte
+ * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
+ */
+ uint64 CurrBytePos;
+ uint64 PrevBytePos;
+
+ /*
+ * Make sure the above heavily-contended spinlock and byte positions are
+ * on their own cache line. In particular, the RedoRecPtr and full page
+ * write variables below should be on a different cache line. They are
+ * read on every WAL insertion, but updated rarely, and we don't want
+ * those reads to steal the cache line containing Curr/PrevBytePos.
+ */
+ char pad[CACHE_LINE_SIZE];
/*
* fullPageWrites is the master copy used by all backends to determine
* This is required because, when full_page_writes is changed by SIGHUP,
* we must WAL-log it before it actually affects WAL-logging by backends.
* Checkpointer sets at startup or after SIGHUP.
+ *
+ * To read these fields, you must hold an insertion lock. To modify them,
+ * you must hold ALL the locks.
*/
+ XLogRecPtr RedoRecPtr; /* current redo point for insertions */
+ bool forcePageWrites; /* forcing full-page writes for PITR? */
bool fullPageWrites;
/*
bool exclusiveBackup;
int nonExclusiveBackups;
XLogRecPtr lastBackupStart;
-} XLogCtlInsert;
-/*
- * Shared state data for XLogWrite/XLogFlush.
- */
-typedef struct XLogCtlWrite
-{
- int curridx; /* cache index of next block to write */
- pg_time_t lastSegSwitchTime; /* time of last xlog segment switch */
-} XLogCtlWrite;
+ /*
+ * WAL insertion locks.
+ */
+ WALInsertLockPadded *WALInsertLocks;
+ LWLockTranche WALInsertLockTranche;
+ int WALInsertLockTrancheId;
+} XLogCtlInsert;
/*
* Total shared-memory state for XLOG.
*/
typedef struct XLogCtlData
{
- /* Protected by WALInsertLock: */
XLogCtlInsert Insert;
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
+ XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
- XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
+ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
+
+ XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG
+ * segment */
- /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck */
- XLogRecPtr unloggedLSN;
+ /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
+ XLogRecPtr unloggedLSN;
slock_t ulsn_lck;
- /* Protected by WALWriteLock: */
- XLogCtlWrite Write;
+ /* Time of last xlog segment switch. Protected by WALWriteLock. */
+ pg_time_t lastSegSwitchTime;
/*
* Protected by info_lck and WALWriteLock (you must hold either lock to
*/
XLogwrtResult LogwrtResult;
+ /*
+ * Latest initialized page in the cache (last byte position + 1).
+ *
+ * To change the identity of a buffer (and InitializedUpTo), you need to
+ * hold WALBufMappingLock. To change the identity of a buffer that's
+ * still dirty, the old page needs to be written out first, and for that
+ * you need WALWriteLock, and you need to ensure that there are no
+ * in-progress insertions to the page by calling
+ * WaitXLogInsertionsToFinish().
+ */
+ XLogRecPtr InitializedUpTo;
+
/*
* These values do not change after startup, although the pointed-to pages
- * and xlblocks values certainly do. Permission to read/write the pages
- * and xlblocks values depends on WALInsertLock and WALWriteLock.
+ * and xlblocks values certainly do. xlblocks values are protected by
+ * WALBufMappingLock.
*/
char *pages; /* buffers for unwritten XLOG pages */
XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
static XLogCtlData *XLogCtl = NULL;
+/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
+static WALInsertLockPadded *WALInsertLocks = NULL;
+
/*
* We maintain an image of pg_control in shared memory.
*/
static ControlFileData *ControlFile = NULL;
/*
- * Macros for managing XLogInsert state. In most cases, the calling routine
- * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
- * so these are passed as parameters instead of being fetched via XLogCtl.
+ * Calculate the amount of space left on the page after 'endptr'. Beware
+ * multiple evaluation!
*/
+#define INSERT_FREESPACE(endptr) \
+ (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
-/* Free space remaining in the current xlog page buffer */
-#define INSERT_FREESPACE(Insert) \
- (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
-
-/* Construct XLogRecPtr value for current insertion point */
-#define INSERT_RECPTR(recptr,Insert,curridx) \
- (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)
-
-#define PrevBufIdx(idx) \
- (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
-
+/* Macro to advance to next buffer index. */
#define NextBufIdx(idx) \
(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
+/*
+ * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
+ * would hold if it was in cache, the page containing 'recptr'.
+ */
+#define XLogRecPtrToBufIdx(recptr) \
+ (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
+
+/*
+ * These are the number of bytes in a WAL page and segment usable for WAL data.
+ */
+#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
+#define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
+
/*
* Private, possibly out-of-date copy of shared LogwrtResult.
* See discussion above.
*/
typedef enum
{
- XLOG_FROM_ANY = 0, /* request to read WAL from any source */
- XLOG_FROM_ARCHIVE, /* restored using restore_command */
- XLOG_FROM_PG_XLOG, /* existing file in pg_xlog */
- XLOG_FROM_STREAM, /* streamed from master */
+ XLOG_FROM_ANY = 0, /* request to read WAL from any source */
+ XLOG_FROM_ARCHIVE, /* restored using restore_command */
+ XLOG_FROM_PG_XLOG, /* existing file in pg_xlog */
+ XLOG_FROM_STREAM /* streamed from master */
} XLogSource;
/* human-readable names for XLogSources, for debugging output */
-static const char *xlogSourceNames[] = { "any", "archive", "pg_xlog", "stream" };
+static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
/*
* openLogFile is -1 or a kernel FD for an open log file segment.
* next.
*/
static XLogSource currentSource = 0; /* XLOG_FROM_* code */
-static bool lastSourceFailed = false;
+static bool lastSourceFailed = false;
typedef struct XLogPageReadPrivate
{
* XLogReceiptSource tracks where we last successfully read some WAL.)
*/
static TimestampTz XLogReceiptTime = 0;
-static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
+static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr; /* start of last record read */
/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;
+/* For WALInsertLockAcquire/Release functions */
+static int MyLockNo = 0;
+static bool holdingAllLocks = false;
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
-static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
+static bool recoveryStopsBefore(XLogRecord *record);
+static bool recoveryStopsAfter(XLogRecord *record);
static void recoveryPausesHere(void);
+static bool recoveryApplyDelay(XLogRecord *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void CreateEndOfRecoveryRecord(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
+static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
-static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
+static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
XLogRecPtr *lsn, BkpBlock *bkpb);
-static bool AdvanceXLInsertBuffer(bool new_segment);
+static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
+ char *blk, bool get_cleanup_lock, bool keep_buffer);
+static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
-static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
+static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
bool find_free, int *max_advance,
bool use_lock);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
int source, bool notexistOk);
-static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
+static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
TimeLineID *readTLI);
static void rm_redo_error_callback(void *arg);
static int get_sync_bit(int method);
+static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
+ XLogRecData *rdata,
+ XLogRecPtr StartPos, XLogRecPtr EndPos);
+static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
+ XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
+static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
+ XLogRecPtr *PrevPtr);
+static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
+static char *GetXLogBuffer(XLogRecPtr ptr);
+static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
+static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
+static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
+
+static void WALInsertLockAcquire(void);
+static void WALInsertLockAcquireExclusive(void);
+static void WALInsertLockRelease(void);
+static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
/*
* Insert an XLOG record having the specified RMID and info bytes,
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
- XLogRecPtr RecPtr;
- XLogRecPtr WriteRqst;
- uint32 freespace;
- int curridx;
XLogRecData *rdt;
XLogRecData *rdt_lastnormal;
Buffer dtbuf[XLR_MAX_BKP_BLOCKS];
uint32 len,
write_len;
unsigned i;
- bool updrqst;
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ bool inserted;
uint8 info_orig = info;
static XLogRecord *rechdr;
+ XLogRecPtr StartPos;
+ XLogRecPtr EndPos;
if (rechdr == NULL)
{
- rechdr = malloc(SizeOfXLogRecord);
- if (rechdr == NULL)
- elog(ERROR, "out of memory");
+ static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF];
+
+ rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf);
MemSet(rechdr, 0, SizeOfXLogRecord);
}
*/
if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
{
- RecPtr = SizeOfXLogLongPHD; /* start of 1st chkpt record */
- return RecPtr;
+ EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */
+ return EndPos;
}
/*
* up.
*
* We may have to loop back to here if a race condition is detected below.
- * We could prevent the race by doing all this work while holding the
- * insert lock, but it seems better to avoid doing CRC calculations while
- * holding the lock.
+ * We could prevent the race by doing all this work while holding an
+ * insertion lock, but it seems better to avoid doing CRC calculations
+ * while holding one.
*
* We add entries for backup blocks to the chain, so that they don't need
* any special treatment in the critical section where the chunks are
/*
* Decide if we need to do full-page writes in this XLOG record: true if
* full_page_writes is on or we have a PITR request for it. Since we
- * don't yet have the insert lock, fullPageWrites and forcePageWrites
- * could change under us, but we'll recheck them once we have the lock.
+ * don't yet have an insertion lock, fullPageWrites and forcePageWrites
+ * could change under us, but we'll recheck them once we have a lock.
*/
doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
{
/* OK, put it in this slot */
dtbuf[i] = rdt->buffer;
- if (XLogCheckBuffer(rdt, doPageWrites,
- &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
+ if (doPageWrites && XLogCheckBuffer(rdt, true,
+ &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
{
dtbuf_bkp[i] = true;
rdt->data = NULL;
COMP_CRC32(rdata_crc, rdt->data, rdt->len);
/*
- * Construct record header (prev-link and CRC are filled in later), and
- * make that the first chunk in the chain.
+ * Construct record header (prev-link is filled in later, after reserving
+ * the space for the record), and make that the first chunk in the chain.
+ *
+ * The CRC calculated for the header here doesn't include prev-link,
+ * because we don't know it yet. It will be added later.
*/
rechdr->xl_xid = GetCurrentTransactionIdIfAny();
rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
rechdr->xl_len = len; /* doesn't include backup blocks */
rechdr->xl_info = info;
rechdr->xl_rmid = rmid;
+ rechdr->xl_prev = InvalidXLogRecPtr;
+ COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
hdr_rdt.next = rdata;
hdr_rdt.data = (char *) rechdr;
hdr_rdt.len = SizeOfXLogRecord;
-
write_len += SizeOfXLogRecord;
+ /*----------
+ *
+ * We have now done all the preparatory work we can without holding a
+ * lock or modifying shared state. From here on, inserting the new WAL
+ * record to the shared WAL buffer cache is a two-step process:
+ *
+ * 1. Reserve the right amount of space from the WAL. The current head of
+ * reserved space is kept in Insert->CurrBytePos, and is protected by
+ * insertpos_lck.
+ *
+ * 2. Copy the record to the reserved WAL space. This involves finding the
+ * correct WAL buffer containing the reserved space, and copying the
+ * record in place. This can be done concurrently in multiple processes.
+ *
+ * To keep track of which insertions are still in-progress, each concurrent
+ * inserter acquires an insertion lock. In addition to just indicating that
+ * an insertion is in progress, the lock tells others how far the inserter
+ * has progressed. There is a small fixed number of insertion locks,
+ * determined by the num_xloginsert_locks GUC. When an inserter crosses a
+ * page boundary, it updates the value stored in the lock to how far it
+ * has inserted, to allow the previous buffer to be flushed.
+ *
+ * Holding onto an insertion lock also protects RedoRecPtr and
+ * fullPageWrites from changing until the insertion is finished.
+ *
+ * Step 2 can usually be done completely in parallel. If the required WAL
+ * page is not initialized yet, you have to grab WALBufMappingLock to
+ * initialize it, but the WAL writer tries to do that ahead of insertions
+ * to keep that from happening in the critical path.
+ *
+ *----------
+ */
START_CRIT_SECTION();
-
- /* Now wait to get insert lock */
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ if (isLogSwitch)
+ WALInsertLockAcquireExclusive();
+ else
+ WALInsertLockAcquire();
/*
* Check to see if my RedoRecPtr is out of date. If so, may have to go
* Oops, this buffer now needs to be backed up, but we
* didn't think so above. Start over.
*/
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
END_CRIT_SECTION();
rdt_lastnormal->next = NULL;
info = info_orig;
if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
{
/* Oops, must redo it with full-page data. */
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
END_CRIT_SECTION();
rdt_lastnormal->next = NULL;
info = info_orig;
}
/*
- * If the current page is completely full, the record goes to the next
- * page, right after the page header.
+ * Reserve space for the record in the WAL. This also sets the xl_prev
+ * pointer.
*/
- updrqst = false;
- freespace = INSERT_FREESPACE(Insert);
- if (freespace == 0)
+ if (isLogSwitch)
+ inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
+ else
{
- updrqst = AdvanceXLInsertBuffer(false);
- freespace = INSERT_FREESPACE(Insert);
+ ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
+ &rechdr->xl_prev);
+ inserted = true;
}
- /* Compute record's XLOG location */
- curridx = Insert->curridx;
- INSERT_RECPTR(RecPtr, Insert, curridx);
+ if (inserted)
+ {
+ /*
+ * Now that xl_prev has been filled in, finish CRC calculation of the
+ * record header.
+ */
+ COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
+ FIN_CRC32(rdata_crc);
+ rechdr->xl_crc = rdata_crc;
+
+ /*
+ * All the record data, including the header, is now ready to be
+ * inserted. Copy the record in the space reserved.
+ */
+ CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
+ }
+ else
+ {
+ /*
+ * This was an xlog-switch record, but the current insert location was
+ * already exactly at the beginning of a segment, so there was no need
+ * to do anything.
+ */
+ }
/*
- * If the record is an XLOG_SWITCH, and we are exactly at the start of a
- * segment, we need not insert it (and don't want to because we'd like
- * consecutive switch requests to be no-ops). Instead, make sure
- * everything is written and flushed through the end of the prior segment,
- * and return the prior segment's end address.
+ * Done! Let others know that we're finished.
*/
- if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
- {
- /* We can release insert lock immediately */
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
- RecPtr -= SizeOfXLogLongPHD;
+ MarkCurrentTransactionIdLoggedIfAny();
- LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
- LogwrtResult = XLogCtl->LogwrtResult;
- if (LogwrtResult.Flush < RecPtr)
- {
- XLogwrtRqst FlushRqst;
-
- FlushRqst.Write = RecPtr;
- FlushRqst.Flush = RecPtr;
- XLogWrite(FlushRqst, false, false);
- }
- LWLockRelease(WALWriteLock);
+ END_CRIT_SECTION();
- END_CRIT_SECTION();
+ /*
+ * Update shared LogwrtRqst.Write, if we crossed page boundary.
+ */
+ if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
- /* wake up walsenders now that we've released heavily contended locks */
- WalSndWakeupProcessRequests();
- return RecPtr;
+ SpinLockAcquire(&xlogctl->info_lck);
+ /* advance global request to include new block(s) */
+ if (xlogctl->LogwrtRqst.Write < EndPos)
+ xlogctl->LogwrtRqst.Write = EndPos;
+ /* update local result copy while I have the chance */
+ LogwrtResult = xlogctl->LogwrtResult;
+ SpinLockRelease(&xlogctl->info_lck);
}
- /* Finish the record header */
- rechdr->xl_prev = Insert->PrevRecord;
+ /*
+ * If this was an XLOG_SWITCH record, flush the record and the empty
+ * padding space that fills the rest of the segment, and perform
+ * end-of-segment actions (eg, notifying archiver).
+ */
+ if (isLogSwitch)
+ {
+ TRACE_POSTGRESQL_XLOG_SWITCH();
+ XLogFlush(EndPos);
- /* Now we can finish computing the record's CRC */
- COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
- FIN_CRC32(rdata_crc);
- rechdr->xl_crc = rdata_crc;
+ /*
+ * Even though we reserved the rest of the segment for us, which is
+ * reflected in EndPos, we return a pointer to just the end of the
+ * xlog-switch record.
+ */
+ if (inserted)
+ {
+ EndPos = StartPos + SizeOfXLogRecord;
+ if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
+ {
+ if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
+ EndPos += SizeOfXLogLongPHD;
+ else
+ EndPos += SizeOfXLogShortPHD;
+ }
+ }
+ }
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
initStringInfo(&buf);
appendStringInfo(&buf, "INSERT @ %X/%X: ",
- (uint32) (RecPtr >> 32), (uint32) RecPtr);
+ (uint32) (EndPos >> 32), (uint32) EndPos);
xlog_outrec(&buf, rechdr);
if (rdata->data != NULL)
{
- appendStringInfo(&buf, " - ");
- RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
+ StringInfoData recordbuf;
+
+ /*
+ * We have to piece together the WAL record data from the
+ * XLogRecData entries, so that we can pass it to the rm_desc
+ * function as one contiguous chunk. (but we can leave out any
+ * extra entries we created for backup blocks)
+ */
+ rdt_lastnormal->next = NULL;
+
+ initStringInfo(&recordbuf);
+ appendBinaryStringInfo(&recordbuf, (char *) rechdr, sizeof(XLogRecord));
+ for (; rdata != NULL; rdata = rdata->next)
+ appendBinaryStringInfo(&recordbuf, rdata->data, rdata->len);
+
+ appendStringInfoString(&buf, " - ");
+ RmgrTable[rechdr->xl_rmid].rm_desc(&buf, (XLogRecord *) recordbuf.data);
+ pfree(recordbuf.data);
}
elog(LOG, "%s", buf.data);
pfree(buf.data);
}
#endif
- /* Record begin of record in appropriate places */
- ProcLastRecPtr = RecPtr;
- Insert->PrevRecord = RecPtr;
+ /*
+ * Update our global variables
+ */
+ ProcLastRecPtr = StartPos;
+ XactLastRecEnd = EndPos;
+
+ return EndPos;
+}
+
+/*
+ * Reserves the right amount of space for a record of given size from the WAL.
+ * *StartPos is set to the beginning of the reserved section, *EndPos to
+ * its end+1. *PrevPtr is set to the beginning of the previous record; it is
+ * used to set the xl_prev of this record.
+ *
+ * This is the performance-critical part of XLogInsert that must be serialized
+ * across backends. The rest can happen mostly in parallel. Try to keep this
+ * section as short as possible, because insertpos_lck can be heavily
+ * contended on a busy system.
+ *
+ * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
+ * where we actually copy the record to the reserved space.
+ */
+static void
+ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
+ XLogRecPtr *PrevPtr)
+{
+ volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint64 startbytepos;
+ uint64 endbytepos;
+ uint64 prevbytepos;
+
+ size = MAXALIGN(size);
+
+ /* All (non xlog-switch) records should contain data. */
+ Assert(size > SizeOfXLogRecord);
/*
- * Append the data, including backup blocks if any
+ * The duration the spinlock needs to be held is minimized by minimizing
+ * the calculations that have to be done while holding the lock. The
+ * current tip of reserved WAL is kept in CurrBytePos, as a byte position
+ * that only counts "usable" bytes in WAL, that is, it excludes all WAL
+ * page headers. The mapping between "usable" byte positions and physical
+ * positions (XLogRecPtrs) can be done outside the locked region, and
+ * because the usable byte position doesn't include any headers, reserving
+ * X bytes from WAL is almost as simple as "CurrBytePos += X".
*/
- rdata = &hdr_rdt;
- while (write_len)
+ SpinLockAcquire(&Insert->insertpos_lck);
+
+ startbytepos = Insert->CurrBytePos;
+ endbytepos = startbytepos + size;
+ prevbytepos = Insert->PrevBytePos;
+ Insert->CurrBytePos = endbytepos;
+ Insert->PrevBytePos = startbytepos;
+
+ SpinLockRelease(&Insert->insertpos_lck);
+
+ *StartPos = XLogBytePosToRecPtr(startbytepos);
+ *EndPos = XLogBytePosToEndRecPtr(endbytepos);
+ *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
+
+ /*
+ * Check that the conversions between "usable byte positions" and
+ * XLogRecPtrs work consistently in both directions.
+ */
+ Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
+ Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
+ Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
+}
+
+/*
+ * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
+ *
+ * A log-switch record is handled slightly differently. The rest of the
+ * segment will be reserved for this insertion, as indicated by the returned
+ * *EndPos value. However, if we are already at the beginning of the current
+ * segment, *StartPos and *EndPos are set to the current location without
+ * reserving any space, and the function returns false.
+ */
+static bool
+ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
+{
+ volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint64 startbytepos;
+ uint64 endbytepos;
+ uint64 prevbytepos;
+ uint32 size = SizeOfXLogRecord;
+ XLogRecPtr ptr;
+ uint32 segleft;
+
+ /*
+ * These calculations are a bit heavy-weight to be done while holding a
+ * spinlock, but since we're holding all the WAL insertion locks, there
+ * are no other inserters competing for it. GetXLogInsertRecPtr() does
+ * compete for it, but that's not called very frequently.
+ */
+ SpinLockAcquire(&Insert->insertpos_lck);
+
+ startbytepos = Insert->CurrBytePos;
+
+ ptr = XLogBytePosToEndRecPtr(startbytepos);
+ if (ptr % XLOG_SEG_SIZE == 0)
+ {
+ SpinLockRelease(&Insert->insertpos_lck);
+ *EndPos = *StartPos = ptr;
+ return false;
+ }
+
+ endbytepos = startbytepos + size;
+ prevbytepos = Insert->PrevBytePos;
+
+ *StartPos = XLogBytePosToRecPtr(startbytepos);
+ *EndPos = XLogBytePosToEndRecPtr(endbytepos);
+
+ segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
+ if (segleft != XLOG_SEG_SIZE)
+ {
+ /* consume the rest of the segment */
+ *EndPos += segleft;
+ endbytepos = XLogRecPtrToBytePos(*EndPos);
+ }
+ Insert->CurrBytePos = endbytepos;
+ Insert->PrevBytePos = startbytepos;
+
+ SpinLockRelease(&Insert->insertpos_lck);
+
+ *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
+
+ Assert((*EndPos) % XLOG_SEG_SIZE == 0);
+ Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
+ Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
+ Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
+
+ return true;
+}
+
+/*
+ * Subroutine of XLogInsert. Copies a WAL record to an already-reserved
+ * area in the WAL.
+ */
+static void
+CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
+ XLogRecPtr StartPos, XLogRecPtr EndPos)
+{
+ char *currpos;
+ int freespace;
+ int written;
+ XLogRecPtr CurrPos;
+ XLogPageHeader pagehdr;
+
+ /* The first chunk is the record header */
+ Assert(rdata->len == SizeOfXLogRecord);
+
+ /*
+ * Get a pointer to the right place in the right WAL buffer to start
+ * inserting to.
+ */
+ CurrPos = StartPos;
+ currpos = GetXLogBuffer(CurrPos);
+ freespace = INSERT_FREESPACE(CurrPos);
+
+ /*
+ * There should be enough space for at least the first field (xl_tot_len)
+ * on this page.
+ */
+ Assert(freespace >= sizeof(uint32));
+
+ /* Copy record data */
+ written = 0;
+ while (rdata != NULL)
{
- while (rdata->data == NULL)
- rdata = rdata->next;
+ char *rdata_data = rdata->data;
+ int rdata_len = rdata->len;
- if (freespace > 0)
+ while (rdata_len > freespace)
{
- if (rdata->len > freespace)
+ /*
+ * Write what fits on this page, and continue on the next page.
+ */
+ Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
+ memcpy(currpos, rdata_data, freespace);
+ rdata_data += freespace;
+ rdata_len -= freespace;
+ written += freespace;
+ CurrPos += freespace;
+
+ /*
+ * Get pointer to beginning of next page, and set the xlp_rem_len
+ * in the page header. Set XLP_FIRST_IS_CONTRECORD.
+ *
+ * It's safe to set the contrecord flag and xlp_rem_len without a
+ * lock on the page. All the other flags were already set when the
+ * page was initialized, in AdvanceXLInsertBuffer, and we're the
+ * only backend that needs to set the contrecord flag.
+ */
+ currpos = GetXLogBuffer(CurrPos);
+ pagehdr = (XLogPageHeader) currpos;
+ pagehdr->xlp_rem_len = write_len - written;
+ pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
+
+ /* skip over the page header */
+ if (CurrPos % XLogSegSize == 0)
{
- memcpy(Insert->currpos, rdata->data, freespace);
- rdata->data += freespace;
- rdata->len -= freespace;
- write_len -= freespace;
+ CurrPos += SizeOfXLogLongPHD;
+ currpos += SizeOfXLogLongPHD;
}
else
{
- memcpy(Insert->currpos, rdata->data, rdata->len);
- freespace -= rdata->len;
- write_len -= rdata->len;
- Insert->currpos += rdata->len;
- rdata = rdata->next;
- continue;
+ CurrPos += SizeOfXLogShortPHD;
+ currpos += SizeOfXLogShortPHD;
}
+ freespace = INSERT_FREESPACE(CurrPos);
}
- /* Use next buffer */
- updrqst = AdvanceXLInsertBuffer(false);
- curridx = Insert->curridx;
- /* Mark page header to indicate this record continues on the page */
- Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
- Insert->currpage->xlp_rem_len = write_len;
- freespace = INSERT_FREESPACE(Insert);
+ Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
+ memcpy(currpos, rdata_data, rdata_len);
+ currpos += rdata_len;
+ CurrPos += rdata_len;
+ freespace -= rdata_len;
+ written += rdata_len;
+
+ rdata = rdata->next;
+ }
+ Assert(written == write_len);
+
+ /* Align the end position, so that the next record starts aligned */
+ CurrPos = MAXALIGN64(CurrPos);
+
+ /*
+ * If this was an xlog-switch, it's not enough to write the switch record,
+ * we also have to consume all the remaining space in the WAL segment. We
+ * have already reserved it for us, but we still need to make sure it's
+ * allocated and zeroed in the WAL buffers so that when the caller (or
+ * someone else) does XLogWrite(), it can really write out all the zeros.
+ */
+ if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
+ {
+ /* An xlog-switch record doesn't contain any data besides the header */
+ Assert(write_len == SizeOfXLogRecord);
+
+ /*
+ * We do this one page at a time, to make sure we don't deadlock
+ * against ourselves if wal_buffers < XLOG_SEG_SIZE.
+ */
+ Assert(EndPos % XLogSegSize == 0);
+
+ /* Use up all the remaining space on the first page */
+ CurrPos += freespace;
+
+ while (CurrPos < EndPos)
+ {
+ /* initialize the next page (if not initialized already) */
+ WALInsertLockUpdateInsertingAt(CurrPos);
+ AdvanceXLInsertBuffer(CurrPos, false);
+ CurrPos += XLOG_BLCKSZ;
+ }
}
- /* Ensure next record will be properly aligned */
- Insert->currpos = (char *) Insert->currpage +
- MAXALIGN(Insert->currpos - (char *) Insert->currpage);
- freespace = INSERT_FREESPACE(Insert);
+ if (CurrPos != EndPos)
+ elog(PANIC, "space reserved for WAL record does not match what was written");
+}
+
+/*
+ * Acquire a WAL insertion lock, for inserting to WAL.
+ */
+static void
+WALInsertLockAcquire(void)
+{
+ bool immed;
/*
- * The recptr I return is the beginning of the *next* record. This will be
- * stored as LSN for changed data pages...
+ * It doesn't matter which of the WAL insertion locks we acquire, so try
+ * the one we used last time. If the system isn't particularly busy, it's
+ * a good bet that it's still available, and it's good to have some
+ * affinity to a particular lock so that you don't unnecessarily bounce
+ * cache lines between processes when there's no contention.
+ *
+ * If this is the first time through in this backend, pick a lock
+ * (semi-)randomly. This allows the locks to be used evenly if you have a
+ * lot of very short connections.
*/
- INSERT_RECPTR(RecPtr, Insert, curridx);
+ static int lockToTry = -1;
+
+ if (lockToTry == -1)
+ lockToTry = MyProc->pgprocno % num_xloginsert_locks;
+ MyLockNo = lockToTry;
/*
- * If the record is an XLOG_SWITCH, we must now write and flush all the
- * existing data, and then forcibly advance to the start of the next
- * segment. It's not good to do this I/O while holding the insert lock,
- * but there seems too much risk of confusion if we try to release the
- * lock sooner. Fortunately xlog switch needn't be a high-performance
- * operation anyway...
+ * The insertingAt value is initially set to 0, as we don't know our
+ * insert location yet.
*/
- if (isLogSwitch)
+ immed = LWLockAcquireWithVar(&WALInsertLocks[MyLockNo].l.lock,
+ &WALInsertLocks[MyLockNo].l.insertingAt,
+ 0);
+ if (!immed)
{
- XLogwrtRqst FlushRqst;
- XLogRecPtr OldSegEnd;
+ /*
+ * If we couldn't get the lock immediately, try another lock next
+ * time. On a system with more insertion locks than concurrent
+ * inserters, this causes all the inserters to eventually migrate to a
+ * lock that no-one else is using. On a system with more inserters
+ * than locks, it still helps to distribute the inserters evenly
+ * across the locks.
+ */
+ lockToTry = (lockToTry + 1) % num_xloginsert_locks;
+ }
+}
- TRACE_POSTGRESQL_XLOG_SWITCH();
+/*
+ * Acquire all WAL insertion locks, to prevent other backends from inserting
+ * to WAL.
+ */
+static void
+WALInsertLockAcquireExclusive(void)
+{
+ int i;
+
+ /*
+ * When holding all the locks, we only update the last lock's insertingAt
+ * indicator. The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
+ * than any real XLogRecPtr value, to make sure that no-one blocks waiting
+ * on those.
+ */
+ for (i = 0; i < num_xloginsert_locks - 1; i++)
+ {
+ LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
+ &WALInsertLocks[i].l.insertingAt,
+ UINT64CONST(0xFFFFFFFFFFFFFFFF));
+ }
+ LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
+ &WALInsertLocks[i].l.insertingAt,
+ 0);
- LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+ holdingAllLocks = true;
+}
+
+/*
+ * Release our insertion lock (or locks, if we're holding them all).
+ */
+static void
+WALInsertLockRelease(void)
+{
+ if (holdingAllLocks)
+ {
+ int i;
+
+ for (i = 0; i < num_xloginsert_locks; i++)
+ LWLockRelease(&WALInsertLocks[i].l.lock);
+
+ holdingAllLocks = false;
+ }
+ else
+ {
+ LWLockRelease(&WALInsertLocks[MyLockNo].l.lock);
+ }
+}
+/*
+ * Update our insertingAt value, to let others know that we've finished
+ * inserting up to that point.
+ */
+static void
+WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
+{
+ if (holdingAllLocks)
+ {
/*
- * Flush through the end of the page containing XLOG_SWITCH, and
- * perform end-of-segment actions (eg, notifying archiver).
+ * We use the last lock to mark our actual position, see comments in
+ * WALInsertLockAcquireExclusive.
*/
- WriteRqst = XLogCtl->xlblocks[curridx];
- FlushRqst.Write = WriteRqst;
- FlushRqst.Flush = WriteRqst;
- XLogWrite(FlushRqst, false, true);
+ LWLockUpdateVar(&WALInsertLocks[num_xloginsert_locks - 1].l.lock,
+ &WALInsertLocks[num_xloginsert_locks - 1].l.insertingAt,
+ insertingAt);
+ }
+ else
+ LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
+ &WALInsertLocks[MyLockNo].l.insertingAt,
+ insertingAt);
+}
+
+/*
+ * Wait for any WAL insertions < upto to finish.
+ *
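+ * Returns the location of the oldest insertion that is still in progress.
+ * Any WAL prior to that point has been fully copied into WAL buffers, and
+ * can be flushed out to disk. Because this waits for any insertions older
+ * than 'upto' to finish, the return value is normally >= 'upto'. The one
+ * exception is a request that lies past the end of reserved WAL: 'upto' is
+ * then clamped to the reserved position, so the result can be smaller than
+ * the value the caller passed in.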
+ *
+ * Note: When you are about to write out WAL, you must call this function
+ * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
+ * need to wait for an insertion to finish (or at least advance to next
+ * uninitialized page), and the inserter might need to evict an old WAL buffer
+ * to make room for a new one, which in turn requires WALWriteLock.
+ */
+static XLogRecPtr
+WaitXLogInsertionsToFinish(XLogRecPtr upto)
+{
+ uint64 bytepos;
+ XLogRecPtr reservedUpto;
+ XLogRecPtr finishedUpto;
+ volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
+ int i;
+
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ /* Read the current insert position */
+ SpinLockAcquire(&Insert->insertpos_lck);
+ bytepos = Insert->CurrBytePos;
+ SpinLockRelease(&Insert->insertpos_lck);
+ reservedUpto = XLogBytePosToEndRecPtr(bytepos);
+
+ /*
+ * No-one should request to flush a piece of WAL that hasn't even been
+ * reserved yet. However, it can happen if there is a block with a bogus
+ * LSN on disk, for example. XLogFlush checks for that situation and
+ * complains, but only after the flush. Here we just assume that to mean
+ * that all WAL that has been reserved needs to be finished. In this
+ * corner case, the return value can be smaller than the 'upto' argument.
+ */
+ if (upto > reservedUpto)
+ {
+ elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
+ (uint32) (upto >> 32), (uint32) upto,
+ (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
+ upto = reservedUpto;
+ }
+
+ /*
+ * Loop through all the locks, sleeping on any in-progress insert older
+ * than 'upto'.
+ *
+ * finishedUpto is our return value, indicating the point up to which all
+ * the WAL insertions have been finished. Initialize it to the head of
+ * reserved WAL, and as we iterate through the insertion locks, back it
+ * out for any insertion that's still in progress.
+ */
+ finishedUpto = reservedUpto;
+ for (i = 0; i < num_xloginsert_locks; i++)
+ {
+ XLogRecPtr insertingat = InvalidXLogRecPtr;
+
+ do
+ {
+ /*
+ * See if this insertion is in progress. LWLockWait will wait for
+ * the lock to be released, or for the 'value' to be set by a
+ * LWLockUpdateVar call. When a lock is initially acquired, its
+ * value is 0 (InvalidXLogRecPtr), which means that we don't know
+ * where it's inserting yet. We will have to wait for it. If
+ * it's a small insertion, the record will most likely fit on the
+ * same page and the inserter will release the lock without ever
+ * calling LWLockUpdateVar. But if it has to sleep, it will
+ * advertise the insertion point with LWLockUpdateVar before
+ * sleeping.
+ */
+ if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
+ &WALInsertLocks[i].l.insertingAt,
+ insertingat, &insertingat))
+ {
+ /* the lock was free, so no insertion in progress */
+ insertingat = InvalidXLogRecPtr;
+ break;
+ }
+
+ /*
+ * This insertion is still in progress. Have to wait, unless the
+ * inserter has proceeded past 'upto'.
+ */
+ } while (insertingat < upto);
+
+ if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
+ finishedUpto = insertingat;
+ }
+ return finishedUpto;
+}
+
+/*
+ * Get a pointer to the right location in the WAL buffer containing the
+ * given XLogRecPtr.
+ *
+ * If the page is not initialized yet, it is initialized. That might require
+ * evicting an old dirty buffer from the buffer cache, which means I/O.
+ *
+ * The caller must ensure that the page containing the requested location
+ * isn't evicted yet, and won't be evicted. The way to ensure that is to
+ * hold onto a WAL insertion lock with the insertingAt position set to
+ * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
+ * to evict an old page from the buffer. (This means that once you call
+ * GetXLogBuffer() with a given 'ptr', you must not access anything before
+ * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
+ * later, because older buffers might be recycled already)
+ */
+static char *
+GetXLogBuffer(XLogRecPtr ptr)
+{
+ int idx;
+ XLogRecPtr endptr;
+ static uint64 cachedPage = 0;
+ static char *cachedPos = NULL;
+ XLogRecPtr expectedEndPtr;
+
+ /*
+ * Fast path for the common case that we need to access again the same
+ * page as last time.
+ */
+ if (ptr / XLOG_BLCKSZ == cachedPage)
+ {
+ Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
+ Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
+ return cachedPos + ptr % XLOG_BLCKSZ;
+ }
- /* Set up the next buffer as first page of next segment */
- /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
- (void) AdvanceXLInsertBuffer(true);
+ /*
+ * The XLog buffer cache is organized so that a page is always loaded to a
+ * particular buffer. That way we can easily calculate the buffer a given
+ * page must be loaded into, from the XLogRecPtr alone.
+ */
+ idx = XLogRecPtrToBufIdx(ptr);
- /* There should be no unwritten data */
- curridx = Insert->curridx;
- Assert(curridx == XLogCtl->Write.curridx);
+ /*
+ * See what page is loaded in the buffer at the moment. It could be the
+ * page we're looking for, or something older. It can't be anything newer
+ * - that would imply the page we're looking for has already been written
+ * out to disk and evicted, and the caller is responsible for making sure
+ * that doesn't happen.
+ *
+ * However, we don't hold a lock while we read the value. If someone has
+ * just initialized the page, it's possible that we get a "torn read" of
+ * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
+ * that case we will see a bogus value. That's ok, we'll grab the mapping
+ * lock (in AdvanceXLInsertBuffer) and retry if we see anything other than
+ * the page we're looking for. But it means that when we do this unlocked
+ * read, we might see a value that appears to be ahead of the page we're
+ * looking for. Don't PANIC on that, until we've verified the value while
+ * holding the lock.
+ */
+ expectedEndPtr = ptr;
+ expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
- /* Compute end address of old segment */
- OldSegEnd = XLogCtl->xlblocks[curridx];
- OldSegEnd -= XLOG_BLCKSZ;
+ endptr = XLogCtl->xlblocks[idx];
+ if (expectedEndPtr != endptr)
+ {
+ /*
+ * Let others know that we're finished inserting the record up to the
+ * page boundary.
+ */
+ WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);
- /* Make it look like we've written and synced all of old segment */
- LogwrtResult.Write = OldSegEnd;
- LogwrtResult.Flush = OldSegEnd;
+ AdvanceXLInsertBuffer(ptr, false);
+ endptr = XLogCtl->xlblocks[idx];
+ if (expectedEndPtr != endptr)
+ elog(PANIC, "could not find WAL buffer for %X/%X",
+ (uint32) (ptr >> 32), (uint32) ptr);
+ }
+ else
+ {
/*
- * Update shared-memory status --- this code should match XLogWrite
+ * Make sure the initialization of the page is visible to us, and
+ * won't arrive later to overwrite the WAL data we write on the page.
*/
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
+ pg_memory_barrier();
+ }
- SpinLockAcquire(&xlogctl->info_lck);
- xlogctl->LogwrtResult = LogwrtResult;
- if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
- xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
- if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
- xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
- SpinLockRelease(&xlogctl->info_lck);
- }
+ /*
+ * Found the buffer holding this page. Return a pointer to the right
+ * offset within the page.
+ */
+ cachedPage = ptr / XLOG_BLCKSZ;
+ cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
- LWLockRelease(WALWriteLock);
+ Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
+ Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
+
+ return cachedPos + ptr % XLOG_BLCKSZ;
+}
+
+/*
+ * Converts a "usable byte position" to XLogRecPtr. A usable byte position
+ * is the position starting from the beginning of WAL, excluding all WAL
+ * page headers.
+ */
+static XLogRecPtr
+XLogBytePosToRecPtr(uint64 bytepos)
+{
+ uint64 fullsegs;
+ uint64 fullpages;
+ uint64 bytesleft;
+ uint32 seg_offset;
+ XLogRecPtr result;
+
+ fullsegs = bytepos / UsableBytesInSegment;
+ bytesleft = bytepos % UsableBytesInSegment;
- updrqst = false; /* done already */
+ if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
+ {
+ /* fits on first page of segment */
+ seg_offset = bytesleft + SizeOfXLogLongPHD;
}
else
{
- /* normal case, ie not xlog switch */
+ /* account for the first page on segment with long header */
+ seg_offset = XLOG_BLCKSZ;
+ bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
+
+ fullpages = bytesleft / UsableBytesInPage;
+ bytesleft = bytesleft % UsableBytesInPage;
+
+ seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
+ }
+
+ XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
- /* Need to update shared LogwrtRqst if some block was filled up */
- if (freespace == 0)
+ return result;
+}
+
+/*
+ * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
+ * returns a pointer to the beginning of the page (ie. before page header),
+ * not to where the first xlog record on that page would go to. This is used
+ * when converting a pointer to the end of a record.
+ */
+static XLogRecPtr
+XLogBytePosToEndRecPtr(uint64 bytepos)
+{
+ uint64 fullsegs;
+ uint64 fullpages;
+ uint64 bytesleft;
+ uint32 seg_offset;
+ XLogRecPtr result;
+
+ fullsegs = bytepos / UsableBytesInSegment;
+ bytesleft = bytepos % UsableBytesInSegment;
+
+ if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
+ {
+ /* fits on first page of segment */
+ if (bytesleft == 0)
+ seg_offset = 0;
+ else
+ seg_offset = bytesleft + SizeOfXLogLongPHD;
+ }
+ else
+ {
+ /* account for the first page on segment with long header */
+ seg_offset = XLOG_BLCKSZ;
+ bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
+
+ fullpages = bytesleft / UsableBytesInPage;
+ bytesleft = bytesleft % UsableBytesInPage;
+
+ if (bytesleft == 0)
+ seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
+ else
+ seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
+ }
+
+ XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
+
+ return result;
+}
+
+/*
+ * Convert an XLogRecPtr to a "usable byte position".
+ */
+static uint64
+XLogRecPtrToBytePos(XLogRecPtr ptr)
+{
+ uint64 fullsegs;
+ uint32 fullpages;
+ uint32 offset;
+ uint64 result;
+
+ XLByteToSeg(ptr, fullsegs);
+
+ fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
+ offset = ptr % XLOG_BLCKSZ;
+
+ if (fullpages == 0)
+ {
+ result = fullsegs * UsableBytesInSegment;
+ if (offset > 0)
{
- /* curridx is filled and available for writing out */
- updrqst = true;
+ Assert(offset >= SizeOfXLogLongPHD);
+ result += offset - SizeOfXLogLongPHD;
}
- else
+ }
+ else
+ {
+ result = fullsegs * UsableBytesInSegment +
+ (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
+ (fullpages - 1) * UsableBytesInPage; /* full pages */
+ if (offset > 0)
{
- /* if updrqst already set, write through end of previous buf */
- curridx = PrevBufIdx(curridx);
+ Assert(offset >= SizeOfXLogShortPHD);
+ result += offset - SizeOfXLogShortPHD;
}
- WriteRqst = XLogCtl->xlblocks[curridx];
}
- LWLockRelease(WALInsertLock);
-
- if (updrqst)
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
+ return result;
+}
- SpinLockAcquire(&xlogctl->info_lck);
- /* advance global request to include new block(s) */
- if (xlogctl->LogwrtRqst.Write < WriteRqst)
- xlogctl->LogwrtRqst.Write = WriteRqst;
- /* update local result copy while I have the chance */
- LogwrtResult = xlogctl->LogwrtResult;
- SpinLockRelease(&xlogctl->info_lck);
- }
+/*
+ * Determine whether the buffer referenced has to be backed up.
+ *
+ * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites
+ * could change later, so the result should be used for optimization purposes
+ * only.
+ */
+bool
+XLogCheckBufferNeedsBackup(Buffer buffer)
+{
+ bool doPageWrites;
+ Page page;
- XactLastRecEnd = RecPtr;
+ page = BufferGetPage(buffer);
- END_CRIT_SECTION();
+ doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites;
- /* wake up walsenders now that we've released heavily contended locks */
- WalSndWakeupProcessRequests();
+ if (doPageWrites && PageGetLSN(page) <= RedoRecPtr)
+ return true; /* buffer requires backup */
- return RecPtr;
+ return false; /* buffer does not need to be backed up */
}
/*
* save the buffer's LSN at *lsn.
*/
static bool
-XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
+XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
XLogRecPtr *lsn, BkpBlock *bkpb)
{
Page page;
page = BufferGetPage(rdata->buffer);
/*
- * XXX We assume page LSN is first data on *every* page that can be passed
- * to XLogInsert, whether it otherwise has the standard page layout or
- * not. We don't need the buffer header lock for PageGetLSN because we
- * have exclusive lock on the page and/or the relation.
+ * We assume page LSN is first data on *every* page that can be passed to
+ * XLogInsert, whether it has the standard page layout or not. We don't
+ * need to take the buffer header lock for PageGetLSN if we hold an
+ * exclusive lock on the page and/or the relation.
*/
- *lsn = PageGetLSN(page);
+ if (holdsExclusiveLock)
+ *lsn = PageGetLSN(page);
+ else
+ *lsn = BufferGetLSNAtomic(rdata->buffer);
- if (doPageWrites &&
- PageGetLSN(page) <= RedoRecPtr)
+ if (*lsn <= RedoRecPtr)
{
/*
* The page needs to be backed up, so set up *bkpb
}
/*
- * Advance the Insert state to the next buffer page, writing out the next
- * buffer if it still contains unwritten data.
- *
- * If new_segment is TRUE then we set up the next buffer page as the first
- * page of the next xlog segment file, possibly but not usually the next
- * consecutive file page.
- *
- * The global LogwrtRqst.Write pointer needs to be advanced to include the
- * just-filled page. If we can do this for free (without an extra lock),
- * we do so here. Otherwise the caller must do it. We return TRUE if the
- * request update still needs to be done, FALSE if we did it internally.
- *
- * Must be called with WALInsertLock held.
+ * Initialize XLOG buffers, writing out old buffers if they still contain
+ * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
+ * true, initialize as many pages as we can without having to write out
+ * unwritten data. Any new pages are initialized to zeros, with page headers
+ * initialized properly.
*/
-static bool
-AdvanceXLInsertBuffer(bool new_segment)
+static void
+AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
- int nextidx = NextBufIdx(Insert->curridx);
- bool update_needed = true;
+ int nextidx;
XLogRecPtr OldPageRqstPtr;
XLogwrtRqst WriteRqst;
- XLogRecPtr NewPageEndPtr;
+ XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
XLogRecPtr NewPageBeginPtr;
XLogPageHeader NewPage;
+ int npages = 0;
+
+ LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
/*
- * Get ending-offset of the buffer page we need to replace (this may be
- * zero if the buffer hasn't been used yet). Fall through if it's already
- * written out.
+ * Now that we have the lock, check if someone initialized the page
+ * already.
*/
- OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
- if (LogwrtResult.Write < OldPageRqstPtr)
+ while (upto >= XLogCtl->InitializedUpTo || opportunistic)
{
- /* nope, got work to do... */
- XLogRecPtr FinishedPageRqstPtr;
-
- FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-
- /* Before waiting, get info_lck and update LogwrtResult */
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
-
- SpinLockAcquire(&xlogctl->info_lck);
- if (xlogctl->LogwrtRqst.Write < FinishedPageRqstPtr)
- xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
- LogwrtResult = xlogctl->LogwrtResult;
- SpinLockRelease(&xlogctl->info_lck);
- }
-
- update_needed = false; /* Did the shared-request update */
+ nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
/*
- * Now that we have an up-to-date LogwrtResult value, see if we still
- * need to write it or if someone else already did.
+ * Get ending-offset of the buffer page we need to replace (this may
+ * be zero if the buffer hasn't been used yet). Fall through if it's
+ * already written out.
*/
+ OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
if (LogwrtResult.Write < OldPageRqstPtr)
{
- /* Must acquire write lock */
- LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
- LogwrtResult = XLogCtl->LogwrtResult;
- if (LogwrtResult.Write >= OldPageRqstPtr)
+ /*
+ * Nope, got work to do. If we just want to pre-initialize as much
+ * as we can without flushing, give up now.
+ */
+ if (opportunistic)
+ break;
+
+ /* Before waiting, get info_lck and update LogwrtResult */
{
- /* OK, someone wrote it already */
- LWLockRelease(WALWriteLock);
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
+ xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
+ LogwrtResult = xlogctl->LogwrtResult;
+ SpinLockRelease(&xlogctl->info_lck);
}
- else
+
+ /*
+ * Now that we have an up-to-date LogwrtResult value, see if we
+ * still need to write it or if someone else already did.
+ */
+ if (LogwrtResult.Write < OldPageRqstPtr)
{
/*
- * Have to write buffers while holding insert lock. This is
- * not good, so only write as much as we absolutely must.
+ * Must acquire write lock. Release WALBufMappingLock first,
+ * to make sure that all insertions that we need to wait for
+ * can finish (up to this same position). Otherwise we risk
+ * deadlock.
*/
- TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
- WriteRqst.Write = OldPageRqstPtr;
- WriteRqst.Flush = 0;
- XLogWrite(WriteRqst, false, false);
- LWLockRelease(WALWriteLock);
- TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
+ LWLockRelease(WALBufMappingLock);
+
+ WaitXLogInsertionsToFinish(OldPageRqstPtr);
+
+ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+
+ LogwrtResult = XLogCtl->LogwrtResult;
+ if (LogwrtResult.Write >= OldPageRqstPtr)
+ {
+ /* OK, someone wrote it already */
+ LWLockRelease(WALWriteLock);
+ }
+ else
+ {
+ /* Have to write it ourselves */
+ TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
+ WriteRqst.Write = OldPageRqstPtr;
+ WriteRqst.Flush = 0;
+ XLogWrite(WriteRqst, false);
+ LWLockRelease(WALWriteLock);
+ TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
+ }
+ /* Re-acquire WALBufMappingLock and retry */
+ LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+ continue;
}
}
- }
- /*
- * Now the next buffer slot is free and we can set it up to be the next
- * output page.
- */
- NewPageBeginPtr = XLogCtl->xlblocks[Insert->curridx];
+ /*
+ * Now the next buffer slot is free and we can set it up to be the
+ * next output page.
+ */
+ NewPageBeginPtr = XLogCtl->InitializedUpTo;
+ NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
- if (new_segment)
- {
- /* force it to a segment start point */
- if (NewPageBeginPtr % XLogSegSize != 0)
- NewPageBeginPtr += XLogSegSize - NewPageBeginPtr % XLogSegSize;
- }
+ Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
- NewPageEndPtr = NewPageBeginPtr;
- NewPageEndPtr += XLOG_BLCKSZ;
- XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
- NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
+ NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
- Insert->curridx = nextidx;
- Insert->currpage = NewPage;
+ /*
+ * Be sure to re-zero the buffer so that bytes beyond what we've
+ * written will look like zeroes and not valid XLOG records...
+ */
+ MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
- Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
+ /*
+ * Fill the new page's header
+ */
+ NewPage ->xlp_magic = XLOG_PAGE_MAGIC;
- /*
- * Be sure to re-zero the buffer so that bytes beyond what we've written
- * will look like zeroes and not valid XLOG records...
- */
- MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
+ /* NewPage->xlp_info = 0; */ /* done by memset */
+ NewPage ->xlp_tli = ThisTimeLineID;
+ NewPage ->xlp_pageaddr = NewPageBeginPtr;
- /*
- * Fill the new page's header
- */
- NewPage ->xlp_magic = XLOG_PAGE_MAGIC;
+ /* NewPage->xlp_rem_len = 0; */ /* done by memset */
- /* NewPage->xlp_info = 0; */ /* done by memset */
- NewPage ->xlp_tli = ThisTimeLineID;
- NewPage ->xlp_pageaddr = NewPageBeginPtr;
+ /*
+ * If online backup is not in progress, mark the header to indicate
+ * that WAL records beginning in this page have removable backup
+ * blocks. This allows the WAL archiver to know whether it is safe to
+ * compress archived WAL data by transforming full-block records into
+ * the non-full-block format. It is sufficient to record this at the
+ * page level because we force a page switch (in fact a segment
+ * switch) when starting a backup, so the flag will be off before any
+ * records can be written during the backup. At the end of a backup,
+ * the last page will be marked as all unsafe when perhaps only part
+ * is unsafe, but at worst the archiver would miss the opportunity to
+ * compress a few records.
+ */
+ if (!Insert->forcePageWrites)
+ NewPage ->xlp_info |= XLP_BKP_REMOVABLE;
- /*
- * If online backup is not in progress, mark the header to indicate that
- * WAL records beginning in this page have removable backup blocks. This
- * allows the WAL archiver to know whether it is safe to compress archived
- * WAL data by transforming full-block records into the non-full-block
- * format. It is sufficient to record this at the page level because we
- * force a page switch (in fact a segment switch) when starting a backup,
- * so the flag will be off before any records can be written during the
- * backup. At the end of a backup, the last page will be marked as all
- * unsafe when perhaps only part is unsafe, but at worst the archiver
- * would miss the opportunity to compress a few records.
- */
- if (!Insert->forcePageWrites)
- NewPage ->xlp_info |= XLP_BKP_REMOVABLE;
+ /*
+ * If first page of an XLOG segment file, make it a long header.
+ */
+ if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
+ {
+ XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
- /*
- * If first page of an XLOG segment file, make it a long header.
- */
- if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
- {
- XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
+ NewLongPage->xlp_sysid = ControlFile->system_identifier;
+ NewLongPage->xlp_seg_size = XLogSegSize;
+ NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
+ NewPage ->xlp_info |= XLP_LONG_HEADER;
+ }
+
+ /*
+ * Make sure the initialization of the page becomes visible to others
+ * before the xlblocks update. GetXLogBuffer() reads xlblocks without
+ * holding a lock.
+ */
+ pg_write_barrier();
- NewLongPage->xlp_sysid = ControlFile->system_identifier;
- NewLongPage->xlp_seg_size = XLogSegSize;
- NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
- NewPage ->xlp_info |= XLP_LONG_HEADER;
+ *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
- Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
+ XLogCtl->InitializedUpTo = NewPageEndPtr;
+
+ npages++;
}
+ LWLockRelease(WALBufMappingLock);
- return update_needed;
+#ifdef WAL_DEBUG
+ if (npages > 0)
+ {
+ elog(DEBUG1, "initialized %d pages, upto %X/%X",
+ npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
+ }
+#endif
}
/*
* This option allows us to avoid uselessly issuing multiple writes when a
* single one would do.
*
- * If xlog_switch == TRUE, we are intending an xlog segment switch, so
- * perform end-of-segment actions after writing the last page, even if
- * it's not physically the end of its segment. (NB: this will work properly
- * only if caller specifies WriteRqst == page-end and flexible == false,
- * and there is some data to write.)
- *
- * Must be called with WALWriteLock held.
+ * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
+ * must be called before grabbing the lock, to make sure the data is ready to
+ * write.
*/
static void
-XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
+XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
- XLogCtlWrite *Write = &XLogCtl->Write;
bool ispartialpage;
bool last_iteration;
bool finishing_seg;
/*
* Within the loop, curridx is the cache block index of the page to
- * consider writing. We advance Write->curridx only after successfully
- * writing pages. (Right now, this refinement is useless since we are
- * going to PANIC if any error occurs anyway; but someday it may come in
- * useful.)
+ * consider writing. Begin at the buffer containing the next unwritten
+ * page, or last partially written page.
*/
- curridx = Write->curridx;
+ curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
while (LogwrtResult.Write < WriteRqst.Write)
{
* if we're passed a bogus WriteRqst.Write that is past the end of the
* last page that's been initialized by AdvanceXLInsertBuffer.
*/
- if (LogwrtResult.Write >= XLogCtl->xlblocks[curridx])
+ XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
+
+ if (LogwrtResult.Write >= EndPtr)
elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
- (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
- (uint32) (XLogCtl->xlblocks[curridx] >> 32),
- (uint32) XLogCtl->xlblocks[curridx]);
+ (uint32) (LogwrtResult.Write >> 32),
+ (uint32) LogwrtResult.Write,
+ (uint32) (EndPtr >> 32), (uint32) EndPtr);
/* Advance LogwrtResult.Write to end of current buffer page */
- LogwrtResult.Write = XLogCtl->xlblocks[curridx];
+ LogwrtResult.Write = EndPtr;
ispartialpage = WriteRqst.Write < LogwrtResult.Write;
if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
{
char *from;
Size nbytes;
+ Size nleft;
+ int written;
/* Need to seek in the file? */
if (openLogOff != startoffset)
if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not seek in log file %s to offset %u: %m",
- XLogFileNameP(ThisTimeLineID, openLogSegNo),
- startoffset)));
+ errmsg("could not seek in log file %s to offset %u: %m",
+ XLogFileNameP(ThisTimeLineID, openLogSegNo),
+ startoffset)));
openLogOff = startoffset;
}
/* OK to write the page(s) */
from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
nbytes = npages * (Size) XLOG_BLCKSZ;
- errno = 0;
- if (write(openLogFile, from, nbytes) != nbytes)
+ nleft = nbytes;
+ do
{
- /* if write didn't set errno, assume no disk space */
- if (errno == 0)
- errno = ENOSPC;
- ereport(PANIC,
- (errcode_for_file_access(),
- errmsg("could not write to log file %s "
- "at offset %u, length %lu: %m",
- XLogFileNameP(ThisTimeLineID, openLogSegNo),
- openLogOff, (unsigned long) nbytes)));
- }
+ errno = 0;
+ written = write(openLogFile, from, nleft);
+ if (written <= 0)
+ {
+ if (errno == EINTR)
+ continue;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to log file %s "
+ "at offset %u, length %zu: %m",
+ XLogFileNameP(ThisTimeLineID, openLogSegNo),
+ openLogOff, nbytes)));
+ }
+ nleft -= written;
+ from += written;
+ } while (nleft > 0);
/* Update state for write */
openLogOff += nbytes;
- Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
npages = 0;
/*
* later. Doing it here ensures that one and only one backend will
* perform this fsync.
*
- * We also do this if this is the last page written for an xlog
- * switch.
- *
* This is also the right place to notify the Archiver that the
* segment is ready to copy to archival storage, and to update the
* timer for archive_timeout, and to signal for a checkpoint if
* too many logfile segments have been used since the last
* checkpoint.
*/
- if (finishing_seg || (xlog_switch && last_iteration))
+ if (finishing_seg)
{
issue_xlog_fsync(openLogFile, openLogSegNo);
if (XLogArchivingActive())
XLogArchiveNotifySeg(openLogSegNo);
- Write->lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
/*
* Request a checkpoint if we've consumed too much xlog since
}
Assert(npages == 0);
- Assert(curridx == Write->curridx);
/*
* If asked to flush, do so
{
/*
* Could get here without iterating above loop, in which case we might
- * have no open file or the wrong one. However, we do not need to
+ * have no open file or the wrong one. However, we do not need to
* fsync more than one file.
*/
if (sync_method != SYNC_METHOD_OPEN &&
/*
* If the WALWriter is sleeping, we should kick it to make it come out of
- * low-power mode. Otherwise, determine whether there's a full page of
+ * low-power mode. Otherwise, determine whether there's a full page of
* WAL available to write.
*/
if (!sleeping)
SetLatch(ProcGlobal->walwriterLatch);
}
+/*
+ * Record the LSN up to which we can remove WAL because it's not required by
+ * any replication slot.
+ *
+ * The given lsn is stored into XLogCtl->replicationSlotMinLSN while holding
+ * info_lck; XLogGetReplicationSlotMinimumLSN() reads the same field under
+ * the same spinlock.
+ */
+void
+XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->replicationSlotMinLSN = lsn;
+ SpinLockRelease(&xlogctl->info_lck);
+}
+
+
+/*
+ * Return the oldest LSN we must retain to satisfy the needs of some
+ * replication slot.
+ *
+ * The result is a copy of XLogCtl->replicationSlotMinLSN taken while holding
+ * info_lck, i.e. the value most recently stored by
+ * XLogSetReplicationSlotMinimumLSN().
+ */
+static XLogRecPtr
+XLogGetReplicationSlotMinimumLSN(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogRecPtr retval;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ retval = xlogctl->replicationSlotMinLSN;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return retval;
+}
+
/*
* Advance minRecoveryPoint in control file.
*
if (!force && newMinRecoveryPoint < lsn)
elog(WARNING,
"xlog min recovery request %X/%X is past current point %X/%X",
- (uint32) (lsn >> 32) , (uint32) lsn,
+ (uint32) (lsn >> 32), (uint32) lsn,
(uint32) (newMinRecoveryPoint >> 32),
(uint32) newMinRecoveryPoint);
minRecoveryPointTLI = newMinRecoveryPointTLI;
ereport(DEBUG2,
- (errmsg("updated min recovery point to %X/%X on timeline %u",
- (uint32) (minRecoveryPoint >> 32),
- (uint32) minRecoveryPoint,
- newMinRecoveryPointTLI)));
+ (errmsg("updated min recovery point to %X/%X on timeline %u",
+ (uint32) (minRecoveryPoint >> 32),
+ (uint32) minRecoveryPoint,
+ newMinRecoveryPointTLI)));
}
}
LWLockRelease(ControlFileLock);
elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
(uint32) (record >> 32), (uint32) record,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
- (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
+ (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
START_CRIT_SECTION();
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogRecPtr insertpos;
/* read LogwrtResult and update local state */
SpinLockAcquire(&xlogctl->info_lck);
if (record <= LogwrtResult.Flush)
break;
+ /*
+ * Before actually performing the write, wait for all in-flight
+ * insertions to the pages we're about to write to finish.
+ */
+ insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
+
/*
* Try to get the write lock. If we can't get it immediately, wait
* until it's released, and recheck if we still need to do the flush
/*
* Sleep before flush! By adding a delay here, we may give further
* backends the opportunity to join the backlog of group commit
- * followers; this can significantly improve transaction throughput, at
- * the risk of increasing transaction latency.
+ * followers; this can significantly improve transaction throughput,
+ * at the risk of increasing transaction latency.
*
* We do not sleep if enableFsync is not turned on, nor if there are
* fewer than CommitSiblings other backends with active transactions.
*/
if (CommitDelay > 0 && enableFsync &&
MinimumActiveBackends(CommitSiblings))
+ {
pg_usleep(CommitDelay);
+ /*
+ * Re-check how far we can now flush the WAL. It's generally not
+ * safe to call WaitXLogInsertionsToFinish while holding
+ * WALWriteLock, because an in-progress insertion might need to
+ * also grab WALWriteLock to make progress. But we know that all
+ * the insertions up to insertpos have already finished, because
+ * that's what the earlier WaitXLogInsertionsToFinish() returned.
+ * We're only calling it again to allow insertpos to be moved
+ * further forward, not to actually wait for anyone.
+ */
+ insertpos = WaitXLogInsertionsToFinish(insertpos);
+ }
+
/* try to write/flush later additions to XLOG as well */
- if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
- {
- XLogCtlInsert *Insert = &XLogCtl->Insert;
- uint32 freespace = INSERT_FREESPACE(Insert);
+ WriteRqst.Write = insertpos;
+ WriteRqst.Flush = insertpos;
- if (freespace == 0) /* buffer is full */
- WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
- else
- {
- WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
- WriteRqstPtr -= freespace;
- }
- LWLockRelease(WALInsertLock);
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = WriteRqstPtr;
- }
- else
- {
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = record;
- }
- XLogWrite(WriteRqst, false, false);
+ XLogWrite(WriteRqst, false);
LWLockRelease(WALWriteLock);
/* done */
elog(ERROR,
"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
(uint32) (record >> 32), (uint32) record,
- (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
+ (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}
/*
* We normally flush only completed blocks; but if there is nothing to do on
* that basis, we check for unflushed async commits in the current incomplete
* block, and flush through the latest one of those. Thus, if async commits
- * are not being used, we will flush complete blocks only. We can guarantee
+ * are not being used, we will flush complete blocks only. We can guarantee
* that async commits reach disk after at most three cycles; normally only
- * one or two. (When flushing complete blocks, we allow XLogWrite to write
+ * one or two. (When flushing complete blocks, we allow XLogWrite to write
* "flexibly", meaning it can stop at the end of the buffer ring; this makes a
* difference only with very high load or long wal_writer_delay, but imposes
* one extra cycle for the worst case for async commits.)
elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
(uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
- (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
+ (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
START_CRIT_SECTION();
- /* now wait for the write lock */
+ /* now wait for any in-progress insertions to finish and get write lock */
+ WaitXLogInsertionsToFinish(WriteRqstPtr);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
LogwrtResult = XLogCtl->LogwrtResult;
if (WriteRqstPtr > LogwrtResult.Flush)
WriteRqst.Write = WriteRqstPtr;
WriteRqst.Flush = WriteRqstPtr;
- XLogWrite(WriteRqst, flexible, false);
+ XLogWrite(WriteRqst, flexible);
wrote_something = true;
}
LWLockRelease(WALWriteLock);
/* wake up walsenders now that we've released heavily contended locks */
WalSndWakeupProcessRequests();
+ /*
+ * Great, done. To take some work off the critical path, try to initialize
+ * as many of the no-longer-needed WAL buffers for future use as we can.
+ */
+ AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
+
return wrote_something;
}
* log, seg: identify segment to be created/opened.
*
* *use_existent: if TRUE, OK to use a pre-existing file (else, any
- * pre-existing file will be deleted). On return, TRUE if a pre-existing
+ * pre-existing file will be deleted). On return, TRUE if a pre-existing
* file was used.
*
* use_lock: if TRUE, acquire ControlFileLock while moving file into
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
+ char zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
char *zbuffer;
XLogSegNo installed_segno;
int max_advance;
unlink(tmppath);
- /*
- * Allocate a buffer full of zeros. This is done before opening the file
- * so that we don't leak the file descriptor if palloc fails.
- *
- * Note: palloc zbuffer, instead of just using a local char array, to
- * ensure it is reasonably well-aligned; this may save a few cycles
- * transferring data to the kernel.
- */
- zbuffer = (char *) palloc0(XLOG_BLCKSZ);
-
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
S_IRUSR | S_IWUSR);
errmsg("could not create file \"%s\": %m", tmppath)));
/*
- * Zero-fill the file. We have to do this the hard way to ensure that all
+ * Zero-fill the file. We have to do this the hard way to ensure that all
* the file space has really been allocated --- on platforms that allow
* "holes" in files, just seeking to the end doesn't allocate intermediate
* space. This way, we know that we have all the space and (after the
- * fsync below) that all the indirect blocks are down on disk. Therefore,
+ * fsync below) that all the indirect blocks are down on disk. Therefore,
* fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
* log file.
+ *
+ * Note: ensure the buffer is reasonably well-aligned; this may save a few
+ * cycles transferring data to the kernel.
*/
+ zbuffer = (char *) MAXALIGN(zbuffer_raw);
+ memset(zbuffer, 0, XLOG_BLCKSZ);
for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
{
errno = 0;
errmsg("could not write to file \"%s\": %m", tmppath)));
}
}
- pfree(zbuffer);
if (pg_fsync(fd) != 0)
{
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", path)));
+ errmsg("could not open file \"%s\": %m", path)));
elog(DEBUG2, "done creating and filling new WAL file");
* a different timeline)
*
* Currently this is only used during recovery, and so there are no locking
- * considerations. But we should be just as tense as XLogFileInit to avoid
+ * considerations. But we should be just as tense as XLogFileInit to avoid
* emplacing a bogus file.
*/
static void
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not open xlog file \"%s\": %m", path)));
+ errmsg("could not open transaction log file \"%s\": %m", path)));
return fd;
}
* the timelines listed in expectedTLEs.
*
* We expect curFileTLI on entry to be the TLI of the preceding file in
- * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
+ * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
* to go backwards; this prevents us from picking up the wrong file when a
* parent timeline extends to higher segment numbers than the child we
* want to read.
/*
* WAL segment files will not be re-read in normal operation, so we advise
- * the OS to release any cached pages. But do not do so if WAL archiving
+ * the OS to release any cached pages. But do not do so if WAL archiving
* or streaming is active, because archiver and walsender process could
* use the cache to read the WAL segment.
*/
}
}
+/*
+ * Return the last WAL segment removed, or 0 if no segment has been removed
+ * since startup.
+ *
+ * The value is copied out of XLogCtl->lastRemovedSegNo while holding
+ * info_lck.
+ *
+ * NB: the result can be out of date arbitrarily fast; the caller has to deal
+ * with that.
+ */
+XLogSegNo
+XLogGetLastRemovedSegno(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogSegNo lastRemovedSegNo;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ lastRemovedSegNo = xlogctl->lastRemovedSegNo;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return lastRemovedSegNo;
+}
+
/*
* Update the last removed segno pointer in shared memory, to reflect
* that the given XLOG file has been removed.
{
/*
* We ignore the timeline part of the XLOG segment identifiers in
- * deciding whether a segment is still needed. This ensures that we
+ * deciding whether a segment is still needed. This ensures that we
* won't prematurely remove a segment from a parent timeline. We could
* probably be a little more proactive about removing segments of
* non-parent timelines, but that would be a whole lot more
RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
bool get_cleanup_lock, bool keep_buffer)
{
- Buffer buffer;
- Page page;
BkpBlock bkpb;
char *blk;
int i;
if (i == block_index)
{
/* Found it, apply the update */
- buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
- RBM_ZERO);
- Assert(BufferIsValid(buffer));
- if (get_cleanup_lock)
- LockBufferForCleanup(buffer);
- else
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
- page = (Page) BufferGetPage(buffer);
-
- if (bkpb.hole_length == 0)
- {
- memcpy((char *) page, blk, BLCKSZ);
- }
- else
- {
- memcpy((char *) page, blk, bkpb.hole_offset);
- /* must zero-fill the hole */
- MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
- memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
- blk + bkpb.hole_offset,
- BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
- }
-
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
-
- if (!keep_buffer)
- UnlockReleaseBuffer(buffer);
-
- return buffer;
+ return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
+ keep_buffer);
}
blk += BLCKSZ - bkpb.hole_length;
return InvalidBuffer; /* keep compiler quiet */
}
+/*
+ * Workhorse for RestoreBackupBlock usable without an xlog record
+ *
+ * Restores a full-page image from BkpBlock and a data pointer.
+ *
+ * lsn: LSN stamped on the restored page via PageSetLSN().
+ * bkpb: identifies the target page (node, fork, block) and describes the
+ * "hole" (hole_offset, hole_length) that was left out of the image.
+ * blk: the page image with the hole removed, i.e. BLCKSZ - hole_length
+ * bytes are read from it.
+ * get_cleanup_lock: if true the buffer is locked with
+ * LockBufferForCleanup(), otherwise with BUFFER_LOCK_EXCLUSIVE.
+ * keep_buffer: if false, UnlockReleaseBuffer() is called on the buffer
+ * before returning; if true the buffer is returned still locked.
+ *
+ * Returns the buffer the page was restored into. The buffer is read with
+ * RBM_ZERO, its contents are overwritten with the image (the hole is
+ * zero-filled), and it is marked dirty.
+ */
+static Buffer
+RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
+ bool get_cleanup_lock, bool keep_buffer)
+{
+ Buffer buffer;
+ Page page;
+
+ buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
+ RBM_ZERO);
+ Assert(BufferIsValid(buffer));
+ if (get_cleanup_lock)
+ LockBufferForCleanup(buffer);
+ else
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ page = (Page) BufferGetPage(buffer);
+
+ if (bkpb.hole_length == 0)
+ {
+ memcpy((char *) page, blk, BLCKSZ);
+ }
+ else
+ {
+ memcpy((char *) page, blk, bkpb.hole_offset);
+ /* must zero-fill the hole; the image in blk does not contain it */
+ MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
+ memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
+ blk + bkpb.hole_offset,
+ BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
+ }
+
+ /*
+ * The checksum value on this page is currently invalid. We don't need to
+ * reset it here since it will be set before being written.
+ */
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+
+ if (!keep_buffer)
+ UnlockReleaseBuffer(buffer);
+
+ return buffer;
+}
+
/*
* Attempt to read an XLOG record.
*
for (;;)
{
- char *errormsg;
+ char *errormsg;
record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
ReadRecPtr = xlogreader->ReadRecPtr;
}
/*
- * We only end up here without a message when XLogPageRead() failed
- * - in that case we already logged something.
- * In StandbyMode that only happens if we have been triggered, so
- * we shouldn't loop anymore in that case.
+ * We only end up here without a message when XLogPageRead()
+ * failed - in that case we already logged something. In
+ * StandbyMode that only happens if we have been triggered, so we
+ * shouldn't loop anymore in that case.
*/
if (errormsg)
ereport(emode_for_corrupt_record(emode,
RecPtr ? RecPtr : EndRecPtr),
- (errmsg_internal("%s", errormsg) /* already translated */));
+ (errmsg_internal("%s", errormsg) /* already translated */ ));
}
+
/*
* Check page TLI is one of the expected values.
*/
else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
{
char fname[MAXFNAMELEN];
- XLogSegNo segno;
- int32 offset;
+ XLogSegNo segno;
+ int32 offset;
XLByteToSeg(xlogreader->latestPagePtr, segno);
offset = xlogreader->latestPagePtr % XLogSegSize;
XLogFileName(fname, xlogreader->readPageTLI, segno);
ereport(emode_for_corrupt_record(emode,
RecPtr ? RecPtr : EndRecPtr),
- (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
- xlogreader->latestPageTLI,
- fname,
- offset)));
+ (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
+ xlogreader->latestPageTLI,
+ fname,
+ offset)));
record = NULL;
}
lastSourceFailed = true;
/*
- * If archive recovery was requested, but we were still doing crash
- * recovery, switch to archive recovery and retry using the offline
- * archive. We have now replayed all the valid WAL in pg_xlog, so
- * we are presumably now consistent.
+ * If archive recovery was requested, but we were still doing
+ * crash recovery, switch to archive recovery and retry using the
+ * offline archive. We have now replayed all the valid WAL in
+ * pg_xlog, so we are presumably now consistent.
*
* We require that there's at least some valid WAL present in
* pg_xlog, however (!fetch_ckpt). We could recover using the WAL
newExpectedTLEs = readTimeLineHistory(newtarget);
/*
- * If the current timeline is not part of the history of the new
- * timeline, we cannot proceed to it.
+ * If the current timeline is not part of the history of the new timeline,
+ * we cannot proceed to it.
*/
found = false;
- foreach (cell, newExpectedTLEs)
+ foreach(cell, newExpectedTLEs)
{
currentTle = (TimeLineHistoryEntry *) lfirst(cell);
* I/O routines for pg_control
*
* *ControlFile is a buffer in shared memory that holds an image of the
- * contents of pg_control. WriteControlFile() initializes pg_control
+ * contents of pg_control. WriteControlFile() initializes pg_control
* given a preloaded buffer, ReadControlFile() loads the buffer from
* the pg_control file (during postmaster or standalone-backend startup),
* and UpdateControlFile() rewrites pg_control after we modify xlog state.
ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
+ ControlFile->loblksize = LOBLKSIZE;
#ifdef HAVE_INT64_TIMESTAMP
ControlFile->enableIntTimes = true;
" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->loblksize != LOBLKSIZE)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with LOBLKSIZE %d,"
+ " but the server was compiled with LOBLKSIZE %d.",
+ ControlFile->loblksize, (int) LOBLKSIZE),
+ errhint("It looks like you need to recompile or initdb.")));
#ifdef HAVE_INT64_TIMESTAMP
if (ControlFile->enableIntTimes != true)
" but the server was compiled without USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#endif
+
+ /* Make the initdb settings visible as GUC variables, too */
+ SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
+ PGC_INTERNAL, PGC_S_OVERRIDE);
}
void
return ControlFile->system_identifier;
}
+/*
+ * Are checksums enabled for data pages?
+ *
+ * Returns true when ControlFile->data_checksum_version is greater than
+ * zero. ControlFile must already be set up when this is called (asserted).
+ */
+bool
+DataChecksumsEnabled(void)
+{
+ Assert(ControlFile != NULL);
+ return (ControlFile->data_checksum_version > 0);
+}
+
/*
* Returns a fake LSN for unlogged relations.
*
XLogRecPtr
GetFakeLSNForUnloggedRel(void)
{
- XLogRecPtr nextUnloggedLSN;
+ XLogRecPtr nextUnloggedLSN;
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
{
/*
* If we haven't yet changed the boot_val default of -1, just let it
- * be. We'll fix it when XLOGShmemSize is called.
+ * be. We'll fix it when XLOGShmemSize is called.
*/
if (XLOGbuffers == -1)
return true;
/* XLogCtl */
size = sizeof(XLogCtlData);
+
+ /* WAL insertion locks, plus alignment */
+ size = add_size(size, mul_size(sizeof(WALInsertLockPadded), num_xloginsert_locks + 1));
/* xlblocks array */
size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
/* extra alignment padding for XLOG I/O buffers */
- size = add_size(size, ALIGNOF_XLOG_BUFFER);
+ size = add_size(size, XLOG_BLCKSZ);
/* and the buffers themselves */
size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
bool foundCFile,
foundXLog;
char *allocptr;
+ int i;
ControlFile = (ControlFileData *)
ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
Assert(foundCFile && foundXLog);
return;
}
-
memset(XLogCtl, 0, sizeof(XLogCtlData));
/*
memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
+
+ /* WAL insertion locks. Ensure they're aligned to the full padded size */
+ allocptr += sizeof(WALInsertLockPadded) -
+ ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
+ WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
+ (WALInsertLockPadded *) allocptr;
+ allocptr += sizeof(WALInsertLockPadded) * num_xloginsert_locks;
+
+ XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
+
+ XLogCtl->Insert.WALInsertLockTranche.name = "WALInsertLocks";
+ XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
+ XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);
+
+ LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId, &XLogCtl->Insert.WALInsertLockTranche);
+ for (i = 0; i < num_xloginsert_locks; i++)
+ {
+ LWLockInitialize(&WALInsertLocks[i].l.lock,
+ XLogCtl->Insert.WALInsertLockTrancheId);
+ WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
+ }
+
/*
- * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
+ * Align the start of the page buffers to a full xlog block size boundary.
+ * This simplifies some calculations in XLOG insertion. It is also
+ * required for O_DIRECT.
*/
- allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
+ allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
XLogCtl->pages = allocptr;
memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
XLogCtl->SharedRecoveryInProgress = true;
XLogCtl->SharedHotStandbyActive = false;
XLogCtl->WalWriterSleeping = false;
- XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
+
+ SpinLockInit(&XLogCtl->Insert.insertpos_lck);
SpinLockInit(&XLogCtl->info_lck);
SpinLockInit(&XLogCtl->ulsn_lck);
InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
* field, as being about as unique as we can easily get. (Think not to
* use random(), since it hasn't been seeded and there's no portable way
* to seed it other than the system clock value...) The upper half of the
- * uint64 value is just the tv_sec part, while the lower half is the XOR
- * of tv_sec and tv_usec. This is to ensure that we don't lose uniqueness
- * unnecessarily if "uint64" is really only 32 bits wide. A person
- * knowing this encoding can determine the initialization time of the
- * installation, which could perhaps be useful sometimes.
+ * uint64 value is just the tv_sec part, while the lower half contains the
+ * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
+ * PID for a little extra uniqueness. A person knowing this encoding can
+ * determine the initialization time of the installation, which could
+ * perhaps be useful sometimes.
*/
gettimeofday(&tv, NULL);
sysidentifier = ((uint64) tv.tv_sec) << 32;
- sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
+ sysidentifier |= ((uint64) tv.tv_usec) << 12;
+ sysidentifier |= getpid() & 0xFFF;
/* First timeline ID is always 1 */
ThisTimeLineID = 1;
/* page buffer must be aligned suitably for O_DIRECT */
- buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
- page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
+ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
+ page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
memset(page, 0, XLOG_BLCKSZ);
/*
/* Set important parameter values for use when replaying WAL */
ControlFile->MaxConnections = MaxConnections;
+ ControlFile->max_worker_processes = max_worker_processes;
ControlFile->max_prepared_xacts = max_prepared_xacts;
ControlFile->max_locks_per_xact = max_locks_per_xact;
ControlFile->wal_level = wal_level;
+ ControlFile->wal_log_hints = wal_log_hints;
+ ControlFile->data_checksum_version = bootstrap_data_checksum_version;
/* some additional ControlFile fields are set in WriteControlFile() */
}
else if (strcmp(item->name, "recovery_target_time") == 0)
{
- /*
- * if recovery_target_xid or recovery_target_name specified, then
- * this overrides recovery_target_time
- */
- if (recoveryTarget == RECOVERY_TARGET_XID ||
- recoveryTarget == RECOVERY_TARGET_NAME)
- continue;
recoveryTarget = RECOVERY_TARGET_TIME;
/*
}
else if (strcmp(item->name, "recovery_target_name") == 0)
{
- /*
- * if recovery_target_xid specified, then this overrides
- * recovery_target_name
- */
- if (recoveryTarget == RECOVERY_TARGET_XID)
- continue;
recoveryTarget = RECOVERY_TARGET_NAME;
recoveryTargetName = pstrdup(item->value);
(errmsg_internal("recovery_target_name = '%s'",
recoveryTargetName)));
}
+ else if (strcmp(item->name, "recovery_target") == 0)
+ {
+ if (strcmp(item->value, "immediate") == 0)
+ recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid recovery_target parameter"),
+ errhint("The only allowed value is 'immediate'")));
+ ereport(DEBUG2,
+ (errmsg_internal("recovery_target = '%s'",
+ item->value)));
+ }
else if (strcmp(item->name, "recovery_target_inclusive") == 0)
{
/*
(errmsg_internal("primary_conninfo = '%s'",
PrimaryConnInfo)));
}
+ else if (strcmp(item->name, "primary_slot_name") == 0)
+ {
+ ReplicationSlotValidateName(item->value, ERROR);
+ PrimarySlotName = pstrdup(item->value);
+ ereport(DEBUG2,
+ (errmsg_internal("primary_slot_name = '%s'",
+ PrimarySlotName)));
+ }
else if (strcmp(item->name, "trigger_file") == 0)
{
TriggerFile = pstrdup(item->value);
(errmsg_internal("trigger_file = '%s'",
TriggerFile)));
}
+ else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
+ {
+ const char *hintmsg;
+
+ if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
+ &hintmsg))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("parameter \"%s\" requires a temporal value",
+ "recovery_min_apply_delay"),
+ hintmsg ? errhint("%s", _(hintmsg)) : 0));
+ ereport(DEBUG2,
+ (errmsg("recovery_min_apply_delay = '%s'", item->value)));
+ }
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
/*
* If user specified recovery_target_timeline, validate it or compute the
- * "latest" value. We can't do this until after we've gotten the restore
+ * "latest" value. We can't do this until after we've gotten the restore
* command and set InArchiveRecovery, because we need to fetch timeline
* history files from the archive.
*/
(errmsg("archive recovery complete")));
}
+/*
+ * getRecordTimestamp
+ *		Fetch the timestamp carried by a WAL record, if it has one.
+ *
+ * Restore-point records and transaction commit (regular or compact) and
+ * abort records carry a timestamp.  For those, the timestamp is stored in
+ * *recordXtime and true is returned.  For every other record type,
+ * *recordXtime is left untouched and false is returned.
+ */
+static bool
+getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+	switch (record->xl_rmid)
+	{
+		case RM_XLOG_ID:
+			if (info == XLOG_RESTORE_POINT)
+			{
+				xl_restore_point *rp = (xl_restore_point *) XLogRecGetData(record);
+
+				*recordXtime = rp->rp_time;
+				return true;
+			}
+			break;
+
+		case RM_XACT_ID:
+			if (info == XLOG_XACT_COMMIT_COMPACT)
+			{
+				xl_xact_commit_compact *cc = (xl_xact_commit_compact *) XLogRecGetData(record);
+
+				*recordXtime = cc->xact_time;
+				return true;
+			}
+			else if (info == XLOG_XACT_COMMIT)
+			{
+				xl_xact_commit *c = (xl_xact_commit *) XLogRecGetData(record);
+
+				*recordXtime = c->xact_time;
+				return true;
+			}
+			else if (info == XLOG_XACT_ABORT)
+			{
+				xl_xact_abort *a = (xl_xact_abort *) XLogRecGetData(record);
+
+				*recordXtime = a->xact_time;
+				return true;
+			}
+			break;
+
+		default:
+			break;
+	}
+
+	/* no timestamp in this kind of record */
+	return false;
+}
+
/*
* For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG at or after the current record.
- *
- * Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
- * *includeThis is set TRUE if we should apply this record before stopping.
+ * stop applying the XLOG before the current record.
*
- * We also track the timestamp of the latest applied COMMIT/ABORT
- * record in XLogCtl->recoveryLastXTime, for logging purposes.
- * Also, some information is saved in recoveryStopXid et al for use in
- * annotating the new timeline's history file.
+ * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
+ * information is saved in recoveryStopXid et al for use in annotating the
+ * new timeline's history file.
*/
static bool
-recoveryStopsHere(XLogRecord *record, bool *includeThis)
+recoveryStopsBefore(XLogRecord *record)
{
- bool stopsHere;
+ bool stopsHere = false;
uint8 record_info;
- TimestampTz recordXtime;
- char recordRPName[MAXFNAMELEN];
-
- /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
- if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
- return false;
- record_info = record->xl_info & ~XLR_INFO_MASK;
- if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
- {
- xl_xact_commit_compact *recordXactCommitData;
-
- recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
- recordXtime = recordXactCommitData->xact_time;
- }
- else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
- {
- xl_xact_commit *recordXactCommitData;
+ bool isCommit;
+ TimestampTz recordXtime = 0;
- recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
- recordXtime = recordXactCommitData->xact_time;
- }
- else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
+ /* Check if we should stop as soon as reaching consistency */
+ if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
{
- xl_xact_abort *recordXactAbortData;
+ ereport(LOG,
+ (errmsg("recovery stopping after reaching consistency")));
- recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
- recordXtime = recordXactAbortData->xact_time;
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ return true;
}
- else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
- {
- xl_restore_point *recordRestorePointData;
- recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
- recordXtime = recordRestorePointData->rp_time;
- strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
- }
- else
+ /* Otherwise we only consider stopping before COMMIT or ABORT records. */
+ if (record->xl_rmid != RM_XACT_ID)
return false;
-
- /* Do we have a PITR target at all? */
- if (recoveryTarget == RECOVERY_TARGET_UNSET)
- {
- /*
- * Save timestamp of latest transaction commit/abort if this is a
- * transaction record
- */
- if (record->xl_rmid == RM_XACT_ID)
- SetLatestXTime(recordXtime);
+ record_info = record->xl_info & ~XLR_INFO_MASK;
+ if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
+ isCommit = true;
+ else if (record_info == XLOG_XACT_ABORT)
+ isCommit = false;
+ else
return false;
- }
- if (recoveryTarget == RECOVERY_TARGET_XID)
+ if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
{
/*
* There can be only one transaction end record with this exact
* 50% of the time...
*/
stopsHere = (record->xl_xid == recoveryTargetXid);
- if (stopsHere)
- *includeThis = recoveryTargetInclusive;
}
- else if (recoveryTarget == RECOVERY_TARGET_NAME)
- {
- /*
- * There can be many restore points that share the same name, so we
- * stop at the first one
- */
- stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
- /*
- * Ignore recoveryTargetInclusive because this is not a transaction
- * record
- */
- *includeThis = false;
- }
- else
+ if (recoveryTarget == RECOVERY_TARGET_TIME &&
+ getRecordTimestamp(record, &recordXtime))
{
/*
* There can be many transactions that share the same commit time, so
stopsHere = (recordXtime > recoveryTargetTime);
else
stopsHere = (recordXtime >= recoveryTargetTime);
- if (stopsHere)
- *includeThis = false;
}
if (stopsHere)
{
+ recoveryStopAfter = false;
recoveryStopXid = record->xl_xid;
recoveryStopTime = recordXtime;
- recoveryStopAfter = *includeThis;
+ recoveryStopName[0] = '\0';
- if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
+ if (isCommit)
{
- if (recoveryStopAfter)
- ereport(LOG,
- (errmsg("recovery stopping after commit of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
- else
- ereport(LOG,
- (errmsg("recovery stopping before commit of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
+ ereport(LOG,
+ (errmsg("recovery stopping before commit of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
}
- else if (record_info == XLOG_XACT_ABORT)
+ else
{
- if (recoveryStopAfter)
- ereport(LOG,
- (errmsg("recovery stopping after abort of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
- else
- ereport(LOG,
- (errmsg("recovery stopping before abort of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
+ ereport(LOG,
+ (errmsg("recovery stopping before abort of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
}
- else
+ }
+
+ return stopsHere;
+}
+
+/*
+ * Same as recoveryStopsBefore, but called after applying the record.
+ *
+ * Returns true if recovery should stop now that 'record' has been applied,
+ * false otherwise.  On a true return, recoveryStopAfter is set to true and
+ * recoveryStopXid, recoveryStopTime and recoveryStopName are filled in to
+ * describe the stop point.
+ *
+ * Three kinds of target can cause a stop here: a named restore point, an
+ * inclusive transaction-ID target, and the "immediate" target once
+ * reachedConsistency is set.
+ *
+ * We also track the timestamp of the latest applied COMMIT/ABORT
+ * record in XLogCtl->recoveryLastXTime.
+ */
+static bool
+recoveryStopsAfter(XLogRecord *record)
+{
+ uint8 record_info;
+ TimestampTz recordXtime;
+
+ record_info = record->xl_info & ~XLR_INFO_MASK;
+
+ /*
+ * There can be many restore points that share the same name; we stop at
+ * the first one.
+ *
+ * Only restore-point records are examined here, and only when the
+ * recovery target is a named restore point.
+ */
+ if (recoveryTarget == RECOVERY_TARGET_NAME &&
+ record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
+ {
+ xl_restore_point *recordRestorePointData;
+
+ recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+
+ if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
{
- strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ (void) getRecordTimestamp(record, &recoveryStopTime);
+ strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
ereport(LOG,
(errmsg("recovery stopping at restore point \"%s\", time %s",
recoveryStopName,
timestamptz_to_str(recoveryStopTime))));
+ return true;
}
+ }
+
+ if (record->xl_rmid == RM_XACT_ID &&
+ (record_info == XLOG_XACT_COMMIT_COMPACT ||
+ record_info == XLOG_XACT_COMMIT ||
+ record_info == XLOG_XACT_ABORT))
+ {
+ /* Update the last applied transaction timestamp */
+ if (getRecordTimestamp(record, &recordXtime))
+ SetLatestXTime(recordXtime);
/*
- * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
- * restore point since they are timestamped, though the latest
- * transaction time is not updated.
+ * There can be only one transaction end record with this exact
+ * transactionid
+ *
+ * when testing for an xid, we MUST test for equality only, since
+ * transactions are numbered in the order they start, not the order
+ * they complete. A higher numbered xid will complete before you about
+ * 50% of the time...
*/
- if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
- SetLatestXTime(recordXtime);
+ if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
+ record->xl_xid == recoveryTargetXid)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = record->xl_xid;
+ recoveryStopTime = recordXtime;
+ recoveryStopName[0] = '\0';
+
+ if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after commit of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ else if (record_info == XLOG_XACT_ABORT)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after abort of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ return true;
+ }
}
- else if (record->xl_rmid == RM_XACT_ID)
- SetLatestXTime(recordXtime);
- return stopsHere;
+ /* Check if we should stop as soon as reaching consistency */
+ if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after reaching consistency")));
+
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ return false;
}
/*
SpinLockRelease(&xlogctl->info_lck);
}
+/*
+ * When recovery_min_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the master.
+ *
+ * 'record' is the WAL record that is about to be replayed.  Only transaction
+ * commit records (regular or compact) are ever delayed.
+ *
+ * Returns true if we waited, that is, if the wait loop below was entered.
+ * Returns false when no delay is configured, when the record is not a commit
+ * record or carries no timestamp, or when the target apply time has already
+ * passed.
+ *
+ * As a side effect, recoveryDelayUntilTime is set to the earliest time at
+ * which the record may be applied.
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static bool
+recoveryApplyDelay(XLogRecord *record)
+{
+	uint8		record_info;
+	TimestampTz xtime;
+	long		secs;
+	int			microsecs;
+
+	/* nothing to do if no delay configured */
+	if (recovery_min_apply_delay == 0)
+		return false;
+
+	/*
+	 * Is it a COMMIT record?
+	 *
+	 * We deliberately choose not to delay aborts since they have no effect on
+	 * MVCC. We already allow replay of records that don't have a timestamp,
+	 * so there is already opportunity for issues caused by early conflicts on
+	 * standbys.
+	 */
+	record_info = record->xl_info & ~XLR_INFO_MASK;
+	if (!(record->xl_rmid == RM_XACT_ID &&
+		  (record_info == XLOG_XACT_COMMIT_COMPACT ||
+		   record_info == XLOG_XACT_COMMIT)))
+		return false;
+
+	if (!getRecordTimestamp(record, &xtime))
+		return false;
+
+	/* earliest moment at which this record may be applied */
+	recoveryDelayUntilTime =
+		TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+	/*
+	 * Exit without arming the latch if it's already past time to apply this
+	 * record
+	 */
+	TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
+						&secs, &microsecs);
+	if (secs <= 0 && microsecs <= 0)
+		return false;
+
+	while (true)
+	{
+		ResetLatch(&XLogCtl->recoveryWakeupLatch);
+
+		/* might change the trigger file's location */
+		HandleStartupProcInterrupts();
+
+		/* a promotion request cuts the wait short */
+		if (CheckForStandbyTrigger())
+			break;
+
+		/*
+		 * Wait for difference between GetCurrentTimestamp() and
+		 * recoveryDelayUntilTime
+		 */
+		TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
+							&secs, &microsecs);
+
+		if (secs <= 0 && microsecs <= 0)
+			break;
+
+		elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
+			 secs, microsecs / 1000);
+
+		/* sleep for the remaining time, expressed in milliseconds */
+		WaitLatch(&XLogCtl->recoveryWakeupLatch,
+				  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+				  secs * 1000L + microsecs / 1000);
+	}
+	return true;
+}
+
/*
* Save timestamp of latest processed commit/abort record.
*
* For archive recovery, the WAL must be generated with at least 'archive'
* wal_level.
*/
- if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+ if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
{
ereport(WARNING,
(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
* For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
* we must have at least as many backend slots as the primary.
*/
- if (InArchiveRecovery && EnableHotStandby)
+ if (ArchiveRecoveryRequested && EnableHotStandby)
{
if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
ereport(ERROR,
- (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
+ (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
/* We ignore autovacuum_max_workers when we make this test. */
RecoveryRequiresIntParameter("max_connections",
MaxConnections,
ControlFile->MaxConnections);
+ RecoveryRequiresIntParameter("max_worker_processes",
+ max_worker_processes,
+ ControlFile->max_worker_processes);
RecoveryRequiresIntParameter("max_prepared_transactions",
max_prepared_xacts,
ControlFile->max_prepared_xacts);
XLogSegNo endLogSegNo;
TimeLineID PrevTimeLineID;
XLogRecord *record;
- uint32 freespace;
TransactionId oldestActiveXID;
bool backupEndRequired = false;
bool backupFromStandby = false;
(errmsg("control file contains invalid data")));
if (ControlFile->state == DB_SHUTDOWNED)
- ereport(LOG,
+ {
+ /* This is the expected case, so don't be chatty in standalone mode */
+ ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("database system was shut down at %s",
str_time(ControlFile->time))));
+ }
else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
ereport(LOG,
(errmsg("database system was shut down in recovery at %s",
ValidateXLOGDirectoryStructure();
/*
- * Clear out any old relcache cache files. This is *necessary* if we do
+ * Clear out any old relcache cache files. This is *necessary* if we do
* any WAL replay, since that would probably result in the cache files
* being out of sync with database reality. In theory we could leave them
* in place if the database had been cleanly shut down, but it seems
RelationCacheInitFileRemove();
/*
- * Initialize on the assumption we want to recover to the same timeline
+ * Initialize on the assumption we want to recover to the latest timeline
* that's active according to pg_control.
*/
- recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+ if (ControlFile->minRecoveryPointTLI >
+ ControlFile->checkPointCopy.ThisTimeLineID)
+ recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
+ else
+ recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
/*
* Check for recovery control file, and if so set up state for offline
* Save archive_cleanup_command in shared memory so that other processes
* can see it.
*/
- strncpy(XLogCtl->archiveCleanupCommand,
+ strlcpy(XLogCtl->archiveCleanupCommand,
archiveCleanupCommand ? archiveCleanupCommand : "",
sizeof(XLogCtl->archiveCleanupCommand));
ereport(LOG,
(errmsg("starting point-in-time recovery to \"%s\"",
recoveryTargetName)));
+ else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to earliest consistent point")));
else
ereport(LOG,
(errmsg("starting archive recovery")));
}
- else if (ControlFile->minRecoveryPointTLI > 0)
- {
- /*
- * If the minRecoveryPointTLI is set when not in Archive Recovery
- * it means that we have crashed after ending recovery and
- * yet before we wrote a new checkpoint on the new timeline.
- * That means we are doing a crash recovery that needs to cross
- * timelines to get to our newly assigned timeline again.
- * The timeline we are headed for is exact and not 'latest'.
- * As soon as we hit a checkpoint, the minRecoveryPointTLI is
- * reset, so we will not enter crash recovery again.
- */
- Assert(ControlFile->minRecoveryPointTLI != 1);
- recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
- recoveryTargetIsLatest = false;
- }
/*
* Take ownership of the wakeup latch if we're going to sleep during
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory"),
- errdetail("Failed while allocating an XLog reading processor")));
+ errdetail("Failed while allocating an XLog reading processor.")));
xlogreader->system_identifier = ControlFile->system_identifier;
if (read_backup_label(&checkPointLoc, &backupEndRequired,
&backupFromStandby))
{
/*
- * Archive recovery was requested, and thanks to the backup label file,
- * we know how far we need to replay to reach consistency. Enter
+ * Archive recovery was requested, and thanks to the backup label
+ * file, we know how far we need to replay to reach consistency. Enter
* archive recovery directly.
*/
InArchiveRecovery = true;
wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X",
- (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
+ (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
/*
/*
* It's possible that archive recovery was requested, but we don't
* know how far we need to replay the WAL before we reach consistency.
- * This can happen for example if a base backup is taken from a running
- * server using an atomic filesystem snapshot, without calling
+ * This can happen for example if a base backup is taken from a
+ * running server using an atomic filesystem snapshot, without calling
* pg_start/stop_backup. Or if you just kill a running master server
* and put it into archive recovery by creating a recovery.conf file.
*
* replaying all the WAL present in pg_xlog, and only enter archive
* recovery after that.
*
- * But usually we already know how far we need to replay the WAL (up to
- * minRecoveryPoint, up to backupEndPoint, or until we see an
+ * But usually we already know how far we need to replay the WAL (up
+ * to minRecoveryPoint, up to backupEndPoint, or until we see an
* end-of-backup record), and we can enter archive recovery directly.
*/
if (ArchiveRecoveryRequested &&
{
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X",
- (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
+ (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
}
else if (StandbyMode)
{
{
ereport(LOG,
(errmsg("using previous checkpoint record at %X/%X",
- (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
+ (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
}
else
* timeline in the history of the requested timeline, we cannot proceed:
* the backup is not part of the history of the requested timeline.
*/
- Assert(expectedTLEs); /* was initialized by reading checkpoint record */
+ Assert(expectedTLEs); /* was initialized by reading checkpoint
+ * record */
if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
- checkPoint.ThisTimeLineID)
+ checkPoint.ThisTimeLineID)
{
- XLogRecPtr switchpoint;
+ XLogRecPtr switchpoint;
/*
- * tliSwitchPoint will throw an error if the checkpoint's timeline
- * is not in expectedTLEs at all.
+ * tliSwitchPoint will throw an error if the checkpoint's timeline is
+ * not in expectedTLEs at all.
*/
switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
ereport(FATAL,
(errmsg("requested timeline %u is not a child of this server's history",
recoveryTargetTLI),
- errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
+ errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
(uint32) (ControlFile->checkPoint >> 32),
(uint32) ControlFile->checkPoint,
ControlFile->checkPointCopy.ThisTimeLineID,
* history, too.
*/
if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
- tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
- ControlFile->minRecoveryPointTLI)
+ tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
+ ControlFile->minRecoveryPointTLI)
ereport(FATAL,
(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
recoveryTargetTLI,
ereport(DEBUG1,
(errmsg("redo record is at %X/%X; shutdown %s",
- (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
+ (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
wasShutdown ? "TRUE" : "FALSE")));
ereport(DEBUG1,
(errmsg("next transaction ID: %u/%u; next OID: %u",
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
XLogCtl->ckptXid = checkPoint.nextXid;
+ /*
+ * Initialize replication slots, before there's a chance to remove
+ * required resources.
+ */
+ StartupReplicationSlots();
+
+ /*
+ * Start up logical state; this needs to be set up now so we have proper
+ * data during crash recovery.
+ */
+ StartupReorderBuffer();
+
+ /*
+ * Startup MultiXact. We need to do this early for two reasons: one is
+ * that we might try to access multixacts when we do tuple freezing, and
+ * the other is we need its state initialized because we attempt
+ * truncation during restartpoints.
+ */
+ StartupMultiXact();
+
/*
* Initialize unlogged LSN. On a clean shutdown, it's restored from the
* control file. On recovery, all unlogged relations are blown away, so
ThisTimeLineID = checkPoint.ThisTimeLineID;
/*
- * Copy any missing timeline history files between 'now' and the
- * recovery target timeline from archive to pg_xlog. While we don't need
- * those files ourselves - the history file of the recovery target
- * timeline covers all the previous timelines in the history too - a
- * cascading standby server might be interested in them. Or, if you
- * archive the WAL from this server to a different archive than the
- * master, it'd be good for all the history files to get archived there
- * after failover, so that you can use one of the old timelines as a
- * PITR target. Timeline history files are small, so it's better to copy
- * them unnecessarily than not copy them and regret later.
+ * Copy any missing timeline history files between 'now' and the recovery
+ * target timeline from archive to pg_xlog. While we don't need those
+ * files ourselves - the history file of the recovery target timeline
+ * covers all the previous timelines in the history too - a cascading
+ * standby server might be interested in them. Or, if you archive the WAL
+ * from this server to a different archive than the master, it'd be good
+ * for all the history files to get archived there after failover, so that
+ * you can use one of the old timelines as a PITR target. Timeline history
+ * files are small, so it's better to copy them unnecessarily than not
+ * copy them and regret later.
*/
restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
lastFullPageWrites = checkPoint.fullPageWrites;
- RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+ RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
if (RecPtr < checkPoint.redo)
ereport(PANIC,
ereport(LOG,
(errmsg("database system was not properly shut down; "
"automatic recovery in progress")));
- if (recoveryTargetTLI > 0)
+ if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
ereport(LOG,
- (errmsg("crash recovery starts in timeline %u "
- "and has target timeline %u",
- ControlFile->checkPointCopy.ThisTimeLineID,
- recoveryTargetTLI)));
+ (errmsg("crash recovery starts in timeline %u "
+ "and has target timeline %u",
+ ControlFile->checkPointCopy.ThisTimeLineID,
+ recoveryTargetTLI)));
ControlFile->state = DB_IN_CRASH_RECOVERY;
}
ControlFile->prevCheckPoint = ControlFile->checkPoint;
oldestActiveXID = checkPoint.oldestActiveXid;
Assert(TransactionIdIsValid(oldestActiveXID));
+ /* Tell procarray about the range of xids it has to deal with */
+ ProcArrayInitRecovery(ShmemVariableCache->nextXid);
+
/*
- * Startup commit log and subtrans only. Other SLRUs are not
- * maintained during recovery and need not be started yet.
+ * Startup commit log and subtrans only. MultiXact has already
+ * been started up and other SLRUs are not maintained during
+ * recovery and need not be started yet.
*/
StartupCLOG();
StartupSUBTRANS(oldestActiveXID);
}
/*
- * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
- * recoveryLastXTime.
- *
- * This is slightly confusing if we're starting from an online
- * checkpoint; we've just read and replayed the chekpoint record, but
- * we're going to start replay from its redo pointer, which precedes
- * the location of the checkpoint record itself. So even though the
- * last record we've replayed is indeed ReadRecPtr, we haven't
- * replayed all the preceding records yet. That's OK for the current
- * use of these variables.
+ * Initialize shared variables for tracking progress of WAL replay, as
+ * if we had just replayed the record before the REDO location (or the
+ * checkpoint record itself, if it's a shutdown checkpoint).
*/
SpinLockAcquire(&xlogctl->info_lck);
- xlogctl->replayEndRecPtr = ReadRecPtr;
+ if (checkPoint.redo < RecPtr)
+ xlogctl->replayEndRecPtr = checkPoint.redo;
+ else
+ xlogctl->replayEndRecPtr = EndRecPtr;
xlogctl->replayEndTLI = ThisTimeLineID;
- xlogctl->lastReplayedEndRecPtr = EndRecPtr;
- xlogctl->lastReplayedTLI = ThisTimeLineID;
+ xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
+ xlogctl->lastReplayedTLI = xlogctl->replayEndTLI;
xlogctl->recoveryLastXTime = 0;
xlogctl->currentChunkStartTime = 0;
xlogctl->recoveryPause = false;
if (record != NULL)
{
- bool recoveryContinue = true;
- bool recoveryApply = true;
ErrorContextCallback errcallback;
TimestampTz xtime;
ereport(LOG,
(errmsg("redo starts at %X/%X",
- (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
+ (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
/*
* main redo apply loop
*/
do
{
- bool switchedTLI = false;
+ bool switchedTLI = false;
+
#ifdef WAL_DEBUG
if (XLOG_DEBUG ||
(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
initStringInfo(&buf);
appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
- (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
- (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
+ (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
+ (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
xlog_outrec(&buf, record);
- appendStringInfo(&buf, " - ");
- RmgrTable[record->xl_rmid].rm_desc(&buf,
- record->xl_info,
- XLogRecGetData(record));
+ appendStringInfoString(&buf, " - ");
+ RmgrTable[record->xl_rmid].rm_desc(&buf, record);
elog(LOG, "%s", buf.data);
pfree(buf.data);
}
/*
* Have we reached our recovery target?
*/
- if (recoveryStopsHere(record, &recoveryApply))
+ if (recoveryStopsBefore(record))
{
- if (recoveryPauseAtTarget)
- {
- SetRecoveryPause(true);
- recoveryPausesHere();
- }
reachedStopPoint = true; /* see below */
- recoveryContinue = false;
+ break;
+ }
- /* Exit loop if we reached non-inclusive recovery target */
- if (!recoveryApply)
- break;
+ /*
+ * If we've been asked to lag the master, wait on latch until
+ * enough time has passed.
+ */
+ if (recoveryApplyDelay(record))
+ {
+ /*
+ * We test for paused recovery again here. If user sets
+ * delayed apply, it may be because they expect to pause
+ * recovery in case of problems, so we must test again
+ * here otherwise pausing during the delay-wait wouldn't
+ * work.
+ */
+ if (xlogctl->recoveryPause)
+ recoveryPausesHere();
}
/* Setup error traceback support for ereport() */
}
/*
- * Before replaying this record, check if this record
- * causes the current timeline to change. The record is
- * already considered to be part of the new timeline,
- * so we update ThisTimeLineID before replaying it.
- * That's important so that replayEndTLI, which is
- * recorded as the minimum recovery point's TLI if
- * recovery stops after this record, is set correctly.
+ * Before replaying this record, check if this record causes
+ * the current timeline to change. The record is already
+ * considered to be part of the new timeline, so we update
+ * ThisTimeLineID before replaying it. That's important so
+ * that replayEndTLI, which is recorded as the minimum
+ * recovery point's TLI if recovery stops after this record,
+ * is set correctly.
*/
if (record->xl_rmid == RM_XLOG_ID)
{
}
else if (info == XLOG_END_OF_RECOVERY)
{
- xl_end_of_recovery xlrec;
+ xl_end_of_recovery xlrec;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
newTLI = xlrec.ThisTimeLineID;
WalSndWakeup();
/* Exit loop if we reached inclusive recovery target */
- if (!recoveryContinue)
+ if (recoveryStopsAfter(record))
+ {
+ reachedStopPoint = true;
break;
+ }
/* Else, try to fetch the next WAL record */
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
* end of main redo apply loop
*/
+ if (recoveryPauseAtTarget && reachedStopPoint)
+ {
+ SetRecoveryPause(true);
+ recoveryPausesHere();
+ }
+
+ /* Allow resource managers to do any required cleanup. */
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (RmgrTable[rmid].rm_cleanup != NULL)
+ RmgrTable[rmid].rm_cleanup();
+ }
+
ereport(LOG,
(errmsg("redo done at %X/%X",
- (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
+ (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
xtime = GetLatestXTime();
if (xtime)
ereport(LOG,
/*
* Consider whether we need to assign a new timeline ID.
*
- * If we are doing an archive recovery, we always assign a new ID. This
- * handles a couple of issues. If we stopped short of the end of WAL
+ * If we are doing an archive recovery, we always assign a new ID. This
+ * handles a couple of issues. If we stopped short of the end of WAL
* during recovery, then we are clearly generating a new timeline and must
* assign it a unique new ID. Even if we ran to the end, modifying the
* current last segment is problematic because it may result in trying to
PrevTimeLineID = ThisTimeLineID;
if (ArchiveRecoveryRequested)
{
- char reason[200];
+ char reason[200];
Assert(InArchiveRecovery);
snprintf(reason, sizeof(reason),
"at restore point \"%s\"",
recoveryStopName);
+ else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+ snprintf(reason, sizeof(reason), "reached consistency");
else
snprintf(reason, sizeof(reason), "no recovery target specified");
openLogFile = XLogFileOpen(openLogSegNo);
openLogOff = 0;
Insert = &XLogCtl->Insert;
- Insert->PrevRecord = LastRec;
- XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
+ Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
+ Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
/*
* Tricky point here: readBuf contains the *last* block that the LastRec
- * record spans, not the one it starts in. The last block is indeed the
+ * record spans, not the one it starts in. The last block is indeed the
* one we want to use.
*/
- if (EndOfLog % XLOG_BLCKSZ == 0)
+ if (EndOfLog % XLOG_BLCKSZ != 0)
{
- memset(Insert->currpage, 0, XLOG_BLCKSZ);
+ char *page;
+ int len;
+ int firstIdx;
+ XLogRecPtr pageBeginPtr;
+
+ pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
+ Assert(readOff == pageBeginPtr % XLogSegSize);
+
+ firstIdx = XLogRecPtrToBufIdx(EndOfLog);
+
+ /* Copy the valid part of the last block, and zero the rest */
+ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
+ len = EndOfLog % XLOG_BLCKSZ;
+ memcpy(page, xlogreader->readBuf, len);
+ memset(page + len, 0, XLOG_BLCKSZ - len);
+
+ XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
+ XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
}
else
{
- Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
- memcpy((char *) Insert->currpage, xlogreader->readBuf, XLOG_BLCKSZ);
+ /*
+ * There is no partial block to copy. Just set InitializedUpTo, and
+ * let the first attempt to insert a log record initialize the next
+ * buffer.
+ */
+ XLogCtl->InitializedUpTo = EndOfLog;
}
- Insert->currpos = (char *) Insert->currpage +
- (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);
LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
XLogCtl->LogwrtRqst.Write = EndOfLog;
XLogCtl->LogwrtRqst.Flush = EndOfLog;
- freespace = INSERT_FREESPACE(Insert);
- if (freespace > 0)
- {
- /* Make sure rest of page is zero */
- MemSet(Insert->currpos, 0, freespace);
- XLogCtl->Write.curridx = 0;
- }
- else
- {
- /*
- * Whenever LogwrtResult points to exactly the end of a page,
- * Write.curridx must point to the *next* page (see XLogWrite()).
- *
- * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
- * this is sufficient. The first actual attempt to insert a log
- * record will advance the insert state.
- */
- XLogCtl->Write.curridx = NextBufIdx(0);
- }
-
/* Pre-scan prepared transactions to find out the range of XIDs present */
oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
if (InRecovery)
{
- int rmid;
-
- /*
- * Resource managers might need to write WAL records, eg, to record
- * index cleanup actions. So temporarily enable XLogInsertAllowed in
- * this process only.
- */
- LocalSetXLogInsertAllowed();
-
- /*
- * Allow resource managers to do any required cleanup.
- */
- for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
- {
- if (RmgrTable[rmid].rm_cleanup != NULL)
- RmgrTable[rmid].rm_cleanup();
- }
-
- /* Disallow XLogInsert again */
- LocalXLogInsertAllowed = -1;
-
/*
* Perform a checkpoint to update all our recovery activity to disk.
*
* allows some extra error checking in xlog_redo.
*
* In fast promotion, only create a lightweight end-of-recovery record
- * instead of a full checkpoint. A checkpoint is requested later, after
- * we're fully out of recovery mode and already accepting queries.
+ * instead of a full checkpoint. A checkpoint is requested later,
+ * after we're fully out of recovery mode and already accepting
+ * queries.
*/
if (bgwriterLaunched)
{
if (record != NULL)
{
fast_promoted = true;
+
+ /*
+ * Insert a special WAL record to mark the end of
+ * recovery, since we aren't doing a checkpoint. That
+ * means that the checkpointer process may well be in
+ * the middle of a time-smoothed restartpoint and could
+ * continue to be for minutes after this. That sounds
+ * strange, but the effect is roughly the same and it
+ * would be stranger to try to come out of the
+ * restartpoint and then checkpoint. We request a
+ * checkpoint later anyway, just for safety.
+ */
CreateEndOfRecoveryRecord();
}
}
if (!fast_promoted)
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
- CHECKPOINT_IMMEDIATE |
- CHECKPOINT_WAIT);
+ CHECKPOINT_IMMEDIATE |
+ CHECKPOINT_WAIT);
}
else
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
LWLockRelease(ControlFileLock);
/* start the archive_timeout timer running */
- XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
/* also initialize latestCompletedXid, to nextXid - 1 */
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
/*
* Perform end of recovery actions for any SLRUs that need it.
*/
- StartupMultiXact();
TrimCLOG();
+ TrimMultiXact();
/* Reload shared-memory state for prepared transactions */
RecoverPreparedTransactions();
XLogReportParameters();
/*
- * All done. Allow backends to write WAL. (Although the bool flag is
+ * All done. Allow backends to write WAL. (Although the bool flag is
* probably atomic in itself, we use the info_lck here to ensure that
* there are no race conditions concerning visibility of other recent
* updates to shared memory.)
}
/*
- * If there were cascading standby servers connected to us, nudge any
- * wal sender processes to notice that we've been promoted.
+ * If there were cascading standby servers connected to us, nudge any wal
+ * sender processes to notice that we've been promoted.
*/
WalSndWakeup();
* than is appropriate now that we're not in standby mode anymore.
*/
if (fast_promoted)
- RequestCheckpoint(0);
+ RequestCheckpoint(CHECKPOINT_FORCE);
}
/*
static void
CheckRecoveryConsistency(void)
{
+ XLogRecPtr lastReplayedEndRecPtr;
+
/*
* During crash recovery, we don't reach a consistent state until we've
* replayed all the WAL.
if (XLogRecPtrIsInvalid(minRecoveryPoint))
return;
+ /*
+ * assume that we are called in the startup process, and hence don't need
+ * a lock to read lastReplayedEndRecPtr
+ */
+ lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
+
/*
* Have we reached the point where our base backup was completed?
*/
if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
- ControlFile->backupEndPoint <= EndRecPtr)
+ ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
{
/*
* We have reached the end of base backup, as indicated by pg_control.
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- if (ControlFile->minRecoveryPoint < EndRecPtr)
- ControlFile->minRecoveryPoint = EndRecPtr;
+ if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
+ ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
ControlFile->backupStartPoint = InvalidXLogRecPtr;
ControlFile->backupEndPoint = InvalidXLogRecPtr;
}
/*
- * Have we passed our safe starting point? Note that minRecoveryPoint
- * is known to be incorrectly set if ControlFile->backupEndRequired,
- * until the XLOG_BACKUP_RECORD arrives to advise us of the correct
+ * Have we passed our safe starting point? Note that minRecoveryPoint is
+ * known to be incorrectly set if ControlFile->backupEndRequired, until
+ * the XLOG_BACKUP_RECORD arrives to advise us of the correct
* minRecoveryPoint. All we know prior to that is that we're not
* consistent yet.
*/
if (!reachedConsistency && !ControlFile->backupEndRequired &&
- minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
+ minRecoveryPoint <= lastReplayedEndRecPtr &&
XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
{
/*
reachedConsistency = true;
ereport(LOG,
(errmsg("consistent recovery state reached at %X/%X",
- (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
- (uint32) XLogCtl->lastReplayedEndRecPtr)));
+ (uint32) (lastReplayedEndRecPtr >> 32),
+ (uint32) lastReplayedEndRecPtr)));
}
/*
return false;
else
{
- /* use volatile pointer to prevent code rearrangement */
+ /*
+ * use volatile pointer to make sure we make a fresh read of the
+ * shared variable.
+ */
volatile XLogCtlData *xlogctl = XLogCtl;
- /* spinlock is essential on machines with weak memory ordering! */
- SpinLockAcquire(&xlogctl->info_lck);
LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
- SpinLockRelease(&xlogctl->info_lck);
/*
* Initialize TimeLineID and RedoRecPtr when we discover that recovery
* is finished. InitPostgres() relies upon this behaviour to ensure
- * that InitXLOGAccess() is called at backend startup. (If you change
+ * that InitXLOGAccess() is called at backend startup. (If you change
* this, see also LocalSetXLogInsertAllowed.)
*/
if (!LocalRecoveryInProgress)
+ {
+ /*
+ * If we just exited recovery, make sure we read TimeLineID and
+ * RedoRecPtr after SharedRecoveryInProgress (for machines with
+ * weak memory ordering).
+ */
+ pg_memory_barrier();
InitXLOGAccess();
+ }
+
+ /*
+ * Note: We don't need a memory barrier when we're still in recovery.
+ * We might exit recovery immediately after return, so the caller
+ * can't rely on 'true' meaning that we're still in recovery anyway.
+ */
return LocalRecoveryInProgress;
}
* true. Postmaster knows this by way of signal, not via shared memory.
*
* Unlike testing standbyState, this works in any process that's connected to
- * shared memory.
+ * shared memory. (And note that standbyState alone doesn't tell the truth
+ * anyway.)
*/
bool
HotStandbyActive(void)
}
}
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+bool
+HotStandbyActiveInReplay(void)
+{
+ Assert(AmStartupProcess());
+ return LocalHotStandbyActive;
+}
+
/*
* Is this process allowed to insert new WAL records?
*
ThisTimeLineID = XLogCtl->ThisTimeLineID;
Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
+ /* Initialize our copy of WALInsertLocks and register the tranche */
+ WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
+ LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId,
+ &XLogCtl->Insert.WALInsertLockTranche);
+
/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
(void) GetRedoRecPtr();
}
/*
- * Once spawned, a backend may update its local RedoRecPtr from
- * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
- * to do so. This is done in XLogInsert() or GetRedoRecPtr().
+ * Return the current Redo pointer from shared memory.
+ *
+ * As a side-effect, the local RedoRecPtr copy is updated.
*/
XLogRecPtr
GetRedoRecPtr(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogRecPtr ptr;
+ /*
+ * The possibly not up-to-date copy in XLogCtl is enough. Even if we
+ * grabbed a WAL insertion lock to read the master copy, someone might
+ * update it just after we've released the lock.
+ */
SpinLockAcquire(&xlogctl->info_lck);
- Assert(RedoRecPtr <= xlogctl->Insert.RedoRecPtr);
- RedoRecPtr = xlogctl->Insert.RedoRecPtr;
+ ptr = xlogctl->RedoRecPtr;
SpinLockRelease(&xlogctl->info_lck);
+ if (RedoRecPtr < ptr)
+ RedoRecPtr = ptr;
+
return RedoRecPtr;
}
*
* NOTE: The value *actually* returned is the position of the last full
* xlog page. It lags behind the real insert position by at most 1 page.
- * For that, we don't need to acquire WALInsertLock which can be quite
- * heavily contended, and an approximation is enough for the current
- * usage of this function.
+ * For that, we don't need to scan through WAL insertion locks, and an
+ * approximation is enough for the current usage of this function.
*/
XLogRecPtr
GetInsertRecPtr(void)
/* Need WALWriteLock, but shared lock is sufficient */
LWLockAcquire(WALWriteLock, LW_SHARED);
- result = XLogCtl->Write.lastSegSwitchTime;
+ result = XLogCtl->lastSegSwitchTime;
LWLockRelease(WALWriteLock);
return result;
void
ShutdownXLOG(int code, Datum arg)
{
- ereport(LOG,
+ /* Don't be chatty in standalone mode */
+ ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("shutting down")));
if (RecoveryInProgress())
ShutdownSUBTRANS();
ShutdownMultiXact();
- ereport(LOG,
+ /* Don't be chatty in standalone mode */
+ ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("database system is shut down")));
}
void
CreateCheckPoint(int flags)
{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
bool shutdown;
CheckPoint checkPoint;
XLogRecPtr recptr;
XLogRecData rdata;
uint32 freespace;
XLogSegNo _logSegNo;
+ XLogRecPtr curInsert;
VirtualTransactionId *vxids;
- int nvxids;
+ int nvxids;
/*
* An end-of-recovery checkpoint is really a shutdown checkpoint, just
checkPoint.oldestActiveXid = InvalidTransactionId;
/*
- * We must hold WALInsertLock while examining insert state to determine
- * the checkpoint REDO pointer.
+ * We must block concurrent insertions while examining insert state to
+ * determine the checkpoint REDO pointer.
*/
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
+ curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
/*
* If this isn't a shutdown or forced checkpoint, and we have not inserted
* any XLOG records since the start of the last checkpoint, skip the
- * checkpoint. The idea here is to avoid inserting duplicate checkpoints
+ * checkpoint. The idea here is to avoid inserting duplicate checkpoints
* when the system is idle. That wastes log space, and more importantly it
* exposes us to possible loss of both current and previous checkpoint
* records if the machine crashes just as we're writing the update.
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FORCE)) == 0)
{
- XLogRecPtr curInsert;
-
- INSERT_RECPTR(curInsert, Insert, Insert->curridx);
- if (curInsert == ControlFile->checkPoint +
+ if (curInsert == ControlFile->checkPoint +
MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
{
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
return;
* the buffer flush work. Those XLOG records are logically after the
* checkpoint, even though physically before it. Got that?
*/
- freespace = INSERT_FREESPACE(Insert);
+ freespace = INSERT_FREESPACE(curInsert);
if (freespace == 0)
{
- (void) AdvanceXLInsertBuffer(false);
- /* OK to ignore update return flag, since we will do flush anyway */
- freespace = INSERT_FREESPACE(Insert);
+ if (curInsert % XLogSegSize == 0)
+ curInsert += SizeOfXLogLongPHD;
+ else
+ curInsert += SizeOfXLogShortPHD;
}
- INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
+ checkPoint.redo = curInsert;
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls; this
- * must be done while holding the insert lock AND the info_lck.
+ * must be done while holding all the insertion locks.
*
* Note: if we fail to complete the checkpoint, RedoRecPtr will be left
* pointing past where it really needs to point. This is okay; the only
* XLogInserts that happen while we are dumping buffers must assume that
* their buffer changes are not included in the checkpoint.
*/
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
-
- SpinLockAcquire(&xlogctl->info_lck);
- RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
- SpinLockRelease(&xlogctl->info_lck);
- }
+ RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
/*
- * Now we can release WAL insert lock, allowing other xacts to proceed
- * while we are flushing disk buffers.
+ * Now we can release the WAL insertion locks, allowing other xacts to
+ * proceed while we are flushing disk buffers.
*/
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
+
+ /* Update the info_lck-protected copy of RedoRecPtr as well */
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->RedoRecPtr = checkPoint.redo;
+ SpinLockRelease(&xlogctl->info_lck);
/*
* If enabled, log checkpoint start. We postpone this until now so as not
TRACE_POSTGRESQL_CHECKPOINT_START(flags);
- /*
- * In some cases there are groups of actions that must all occur on
- * one side or the other of a checkpoint record. Before flushing the
- * checkpoint record we must explicitly wait for any backend currently
- * performing those groups of actions.
- *
- * One example is end of transaction, so we must wait for any transactions
- * that are currently in commit critical sections. If an xact inserted
- * its commit record into XLOG just before the REDO point, then a crash
- * restart from the REDO point would not replay that record, which means
- * that our flushing had better include the xact's update of pg_clog. So
- * we wait till he's out of his commit critical section before proceeding.
- * See notes in RecordTransactionCommit().
- *
- * Because we've already released WALInsertLock, this test is a bit fuzzy:
- * it is possible that we will wait for xacts we didn't really need to
- * wait for. But the delay should be short and it seems better to make
- * checkpoint take a bit longer than to hold locks longer than necessary.
- * (In fact, the whole reason we have this issue is that xact.c does
- * commit record XLOG insertion and clog update as two separate steps
- * protected by different locks, but again that seems best on grounds of
- * minimizing lock contention.)
- *
- * A transaction that has not yet set delayChkpt when we look cannot be at
- * risk, since he's not inserted his commit record yet; and one that's
- * already cleared it is not at risk either, since he's done fixing clog
- * and we will correctly flush the update below. So we cannot miss any
- * xacts we need to wait for.
- */
- vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
- if (nvxids > 0)
- {
- uint32 nwaits = 0;
-
- do
- {
- pg_usleep(10000L); /* wait for 10 msec */
- nwaits++;
- } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
- }
- pfree(vxids);
-
/*
* Get the other info we need for the checkpoint record.
*/
*/
END_CRIT_SECTION();
+ /*
+ * In some cases there are groups of actions that must all occur on one
+ * side or the other of a checkpoint record. Before flushing the
+ * checkpoint record we must explicitly wait for any backend currently
+ * performing those groups of actions.
+ *
+ * One example is end of transaction, so we must wait for any transactions
+ * that are currently in commit critical sections. If an xact inserted
+ * its commit record into XLOG just before the REDO point, then a crash
+ * restart from the REDO point would not replay that record, which means
+ * that our flushing had better include the xact's update of pg_clog. So
+ * we wait till he's out of his commit critical section before proceeding.
+ * See notes in RecordTransactionCommit().
+ *
+ * Because we've already released the insertion locks, this test is a bit
+ * fuzzy: it is possible that we will wait for xacts we didn't really need
+ * to wait for. But the delay should be short and it seems better to make
+ * checkpoint take a bit longer than to hold off insertions longer than
+ * necessary. (In fact, the whole reason we have this issue is that xact.c
+ * does commit record XLOG insertion and clog update as two separate steps
+ * protected by different locks, but again that seems best on grounds of
+ * minimizing lock contention.)
+ *
+ * A transaction that has not yet set delayChkpt when we look cannot be at
+ * risk, since he's not inserted his commit record yet; and one that's
+ * already cleared it is not at risk either, since he's done fixing clog
+ * and we will correctly flush the update below. So we cannot miss any
+ * xacts we need to wait for.
+ */
+ vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+ if (nvxids > 0)
+ {
+ do
+ {
+ pg_usleep(10000L); /* wait for 10 msec */
+ } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+ }
+ pfree(vxids);
+
CheckPointGuts(checkPoint.redo, flags);
/*
*/
END_CRIT_SECTION();
+ /*
+ * Now that the checkpoint is safely on disk, we can update the point to
+ * which multixact can be truncated.
+ */
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
+
/*
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/
/*
* Truncate pg_subtrans if possible. We can throw away all data before
- * the oldest XMIN of any running transaction. No future transaction will
+ * the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
- * in subtrans.c). During recovery, though, we mustn't do this because
+ * in subtrans.c). During recovery, though, we mustn't do this because
* StartupSUBTRANS hasn't been called yet.
*/
if (!RecoveryInProgress())
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, false));
+
+ /*
+ * Truncate pg_multixact too.
+ */
+ TruncateMultiXact();
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
* CreateRestartPoint() allows for the case where recovery may end before
* the restartpoint completes so there is no concern of concurrent behaviour.
*/
-void
+static void
CreateEndOfRecoveryRecord(void)
{
- xl_end_of_recovery xlrec;
- XLogRecData rdata;
- XLogRecPtr recptr;
+ xl_end_of_recovery xlrec;
+ XLogRecData rdata;
+ XLogRecPtr recptr;
/* sanity check */
if (!RecoveryInProgress())
xlrec.end_time = time(NULL);
- LWLockAcquire(WALInsertLock, LW_SHARED);
+ WALInsertLockAcquireExclusive();
xlrec.ThisTimeLineID = ThisTimeLineID;
xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
LocalSetXLogInsertAllowed();
XLogFlush(recptr);
/*
- * Update the control file so that crash recovery can follow
- * the timeline changes to this point.
+ * Update the control file so that crash recovery can follow the timeline
+ * changes to this point.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->time = (pg_time_t) xlrec.end_time;
END_CRIT_SECTION();
- LocalXLogInsertAllowed = -1; /* return to "check" state */
+ LocalXLogInsertAllowed = -1; /* return to "check" state */
}
/*
CheckPointMultiXact();
CheckPointPredicate();
CheckPointRelationMap();
+ CheckPointReplicationSlots();
+ CheckPointSnapBuild();
+ CheckPointLogicalRewriteHeap();
CheckPointBuffers(flags); /* performs all required fsyncs */
/* We deliberately delay 2PC checkpointing as long as possible */
CheckPointTwoPhase(checkPointRedo);
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
- int rmid;
-
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
- /*
- * Is it safe to restartpoint? We must ask each of the resource managers
- * whether they have any partial state information that might prevent a
- * correct restart from this point. If so, we skip this opportunity, but
- * return at the next checkpoint record for another try.
- */
- for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
- {
- if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
- if (!(RmgrTable[rmid].rm_safe_restartpoint()))
- {
- elog(trace_recovery(DEBUG2),
- "RM %d not safe to record restart point at %X/%X",
- rmid,
- (uint32) (checkPoint->redo >> 32),
- (uint32) checkPoint->redo);
- return;
- }
- }
-
/*
* Also refrain from creating a restartpoint if we have seen any
* references to non-existent pages. Restarting recovery from the
{
ereport(DEBUG2,
(errmsg("skipping restartpoint, already performed at %X/%X",
- (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo)));
+ (uint32) (lastCheckPoint.redo >> 32),
+ (uint32) lastCheckPoint.redo)));
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
if (flags & CHECKPOINT_IS_SHUTDOWN)
* the number of segments replayed since last restartpoint, and request a
* restartpoint if it exceeds checkpoint_segments.
*
- * You need to hold WALInsertLock and info_lck to update it, although
- * during recovery acquiring WALInsertLock is just pro forma, because
- * there is no other processes updating Insert.RedoRecPtr.
+ * Like in CreateCheckPoint(), hold off insertions to update it, although
+ * during recovery this is just pro forma, because no WAL insertions are
+ * happening.
*/
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
- SpinLockAcquire(&xlogctl->info_lck);
+ WALInsertLockAcquireExclusive();
xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
+ WALInsertLockRelease();
+
+ /* Also update the info_lck-protected copy */
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->RedoRecPtr = lastCheckPoint.redo;
SpinLockRelease(&xlogctl->info_lck);
- LWLockRelease(WALInsertLock);
/*
* Prepare to accumulate statistics.
{
XLogRecPtr receivePtr;
XLogRecPtr replayPtr;
+ TimeLineID replayTLI;
XLogRecPtr endptr;
/*
- * Get the current end of xlog replayed or received, whichever is later.
+ * Get the current end of xlog replayed or received, whichever is
+ * later.
*/
receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
- replayPtr = GetXLogReplayRecPtr(NULL);
+ replayPtr = GetXLogReplayRecPtr(&replayTLI);
endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
KeepLogSeg(endptr, &_logSegNo);
_logSegNo--;
/*
- * Update ThisTimeLineID to the timeline we're currently replaying,
- * so that we install any recycled segments on that timeline.
+ * Try to recycle segments on a useful timeline. If we've been
+ * promoted since the beginning of this restartpoint, use the new
+ * timeline chosen at end of recovery (RecoveryInProgress() sets
+ * ThisTimeLineID in that case). If we're still in recovery, use the
+ * timeline we're currently replaying.
*
* There is no guarantee that the WAL segments will be useful on the
* current timeline; if recovery proceeds to a new timeline right
* not be used, and will go wasted until recycled on the next
* restartpoint. We'll live with that.
*/
- (void) GetXLogReplayRecPtr(&ThisTimeLineID);
+ if (RecoveryInProgress())
+ ThisTimeLineID = replayTLI;
RemoveOldXlogFiles(_logSegNo, endptr);
* segments, since that may supply some of the needed files.)
*/
PreallocXlogFiles(endptr);
+
+ /*
+ * ThisTimeLineID is normally not set when we're still in recovery.
+ * However, recycling/preallocating segments above needed
+ * ThisTimeLineID to determine which timeline to install the segments
+ * on. Reset it now, to restore the normal state of affairs for
+ * debugging purposes.
+ */
+ if (RecoveryInProgress())
+ ThisTimeLineID = 0;
}
+ /*
+ * Due to a historical accident, multixact truncations are not WAL-logged,
+ * but are simply performed every time the mxact horizon is increased. So,
+ * unless we explicitly execute truncations on a standby, it will never
+ * clean out pg_multixact, which is obviously bad, both because it uses
+ * space and because we can wrap around into pre-existing data...
+ *
+ * We can only do the truncation here, after the UpdateControlFile()
+ * above, because we've now safely established a restart point. That
+ * guarantees we will not need to access those multis.
+ *
+ * It's probably worth improving this.
+ */
+ TruncateMultiXact();
+
/*
* Truncate pg_subtrans if possible. We can throw away all data before
- * the oldest XMIN of any running transaction. No future transaction will
+ * the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
- * in subtrans.c). When hot standby is disabled, though, we mustn't do
+ * in subtrans.c). When hot standby is disabled, though, we mustn't do
* this because StartupSUBTRANS hasn't been called yet.
*/
if (EnableHotStandby)
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, false));
/* Real work is done, but log and update before releasing lock. */
LogCheckpointEnd(true);
xtime = GetLatestXTime();
ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
- (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
+ (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
xtime ? errdetail("last completed transaction was at log time %s",
timestamptz_to_str(xtime)) : 0));
}
/*
- * Calculate the last segment that we need to retain because of
- * wal_keep_segments, by subtracting wal_keep_segments from
- * the given xlog location, recptr.
+ * Retreat *logSegNo to the last segment that we need to retain because of
+ * either wal_keep_segments or replication slots.
+ *
+ * This is calculated by subtracting wal_keep_segments from the given xlog
+ * location, recptr, and then making sure that the result is no later than
+ * the segment containing the minimum LSN required by replication slots.
*/
static void
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
{
XLogSegNo segno;
-
- if (wal_keep_segments == 0)
- return;
+ XLogRecPtr keep;
XLByteToSeg(recptr, segno);
+ keep = XLogGetReplicationSlotMinimumLSN();
- /* avoid underflow, don't go below 1 */
- if (segno <= wal_keep_segments)
- segno = 1;
- else
- segno = *logSegNo - wal_keep_segments;
+ /* compute limit for wal_keep_segments first */
+ if (wal_keep_segments > 0)
+ {
+ /* avoid underflow, don't go below 1 */
+ if (segno <= wal_keep_segments)
+ segno = 1;
+ else
+ segno = segno - wal_keep_segments;
+ }
+
+ /*
+  * Then check whether slots limit removal further.  'keep' is the LSN
+  * obtained from XLogGetReplicationSlotMinimumLSN(); it is only considered
+  * when slots are enabled and it is valid.
+  */
+ if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
+ {
+     /* a segment number, not an LSN: it is compared with and stored in segno */
+     XLogSegNo slotSegNo;
+
+     XLByteToSeg(keep, slotSegNo);
+
+     /* never go below segment 1; otherwise retreat to the slots' segment */
+     if (slotSegNo <= 0)
+         segno = 1;
+     else if (slotSegNo < segno)
+         segno = slotSegNo;
+ }
/* don't delete WAL segments newer than the calculated segment */
if (segno < *logSegNo)
* We need not flush the NEXTOID record immediately, because any of the
* just-allocated OIDs could only reach disk as part of a tuple insert or
* update that would have its own XLOG record that must follow the NEXTOID
- * record. Therefore, the standard buffer LSN interlock applied to those
+ * record. Therefore, the standard buffer LSN interlock applied to those
* records will ensure no such OID reaches disk before the NEXTOID record
* does.
*
xl_restore_point xlrec;
xlrec.rp_time = GetCurrentTimestamp();
- strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
+ strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
rdata.buffer = InvalidBuffer;
rdata.data = (char *) &xlrec;
return RecPtr;
}
+/*
+ * XLogSaveBufferForHint
+ *
+ * Emit a full-page image of 'buffer' ahead of a hint-only change, if one is
+ * required.  This is not specific to heap pages; any kind of page may be
+ * passed.  'buffer_std' is handed through to XLogCheckBuffer() together with
+ * the buffer.
+ *
+ * The caller needs no more than a share lock on the buffer content.  That is
+ * also why the ordinary backup block machinery cannot be used here: it
+ * relies on the buffer being exclusively locked, whereas a share-locked page
+ * may still have its LSN or hint bits changed, which can lead to WAL
+ * checksum failures.  We therefore take a private copy of the page and
+ * insert that copy as normal record data.
+ *
+ * Nothing is written when the page has already been logged as a full page
+ * in the current checkpoint round.  The return value is the LSN of the WAL
+ * record that was inserted, or InvalidXLogRecPtr when no record was needed.
+ *
+ * Several backends may attempt this concurrently for the same block, each
+ * recording its own copy in a separate WAL record; that is still OK from a
+ * correctness perspective.
+ */
+XLogRecPtr
+XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
+{
+    XLogRecData chain[2];
+    BkpBlock    blkhdr;
+    XLogRecPtr  page_lsn;
+    char        page_copy[BLCKSZ];
+    char       *page;
+
+    /* No checkpoint may change our view of RedoRecPtr while we work. */
+    Assert(MyPgXact->delayChkpt);
+
+    /* Bring RedoRecPtr up to date so XLogCheckBuffer decides correctly. */
+    GetRedoRecPtr();
+
+    /*
+     * Dummy rdata element, used solely for the XLogCheckBuffer() call; the
+     * array is filled in again below for the actual WAL record.
+     */
+    chain[0].buffer = buffer;
+    chain[0].buffer_std = buffer_std;
+
+    /* The check is made without holding an exclusive lock. */
+    if (!XLogCheckBuffer(chain, false, &page_lsn, &blkhdr))
+        return InvalidXLogRecPtr;
+
+    page = (char *) BufferGetBlock(buffer);
+
+    /*
+     * Work from a copy so that concurrent hint bit or LSN updates cannot
+     * affect what we log.  pd_lower/upper are assumed not to change without
+     * an exclusive lock, so the hole described by blkhdr is stable.
+     *
+     * When buffer_std is false, XLogCheckBuffer() reports hole_offset and
+     * hole_length as 0, so the two copies below are safe in either case:
+     * first the part before the hole, then the part after it.
+     */
+    memcpy(page_copy, page, blkhdr.hole_offset);
+    memcpy(page_copy + blkhdr.hole_offset,
+           page + blkhdr.hole_offset + blkhdr.hole_length,
+           BLCKSZ - blkhdr.hole_offset - blkhdr.hole_length);
+
+    /* First element: the backup block header. */
+    chain[0].data = (char *) &blkhdr;
+    chain[0].len = sizeof(BkpBlock);
+    chain[0].buffer = InvalidBuffer;
+    chain[0].next = &(chain[1]);
+
+    /* Second element: the copied page contents, minus the hole. */
+    chain[1].data = page_copy;
+    chain[1].len = BLCKSZ - blkhdr.hole_length;
+    chain[1].buffer = InvalidBuffer;
+    chain[1].next = NULL;
+
+    return XLogInsert(RM_XLOG_ID, XLOG_FPI, chain);
+}
+
/*
* Check if any of the GUC parameters that are critical for hot standby
* have changed, and update the value in pg_control file if necessary.
XLogReportParameters(void)
{
if (wal_level != ControlFile->wal_level ||
+ wal_log_hints != ControlFile->wal_log_hints ||
MaxConnections != ControlFile->MaxConnections ||
+ max_worker_processes != ControlFile->max_worker_processes ||
max_prepared_xacts != ControlFile->max_prepared_xacts ||
max_locks_per_xact != ControlFile->max_locks_per_xact)
{
{
XLogRecData rdata;
xl_parameter_change xlrec;
+ XLogRecPtr recptr;
xlrec.MaxConnections = MaxConnections;
+ xlrec.max_worker_processes = max_worker_processes;
xlrec.max_prepared_xacts = max_prepared_xacts;
xlrec.max_locks_per_xact = max_locks_per_xact;
xlrec.wal_level = wal_level;
+ xlrec.wal_log_hints = wal_log_hints;
rdata.buffer = InvalidBuffer;
rdata.data = (char *) &xlrec;
rdata.len = sizeof(xlrec);
rdata.next = NULL;
- XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
+ XLogFlush(recptr);
}
ControlFile->MaxConnections = MaxConnections;
+ ControlFile->max_worker_processes = max_worker_processes;
ControlFile->max_prepared_xacts = max_prepared_xacts;
ControlFile->max_locks_per_xact = max_locks_per_xact;
ControlFile->wal_level = wal_level;
+ ControlFile->wal_log_hints = wal_log_hints;
UpdateControlFile();
}
}
*/
if (fullPageWrites)
{
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
Insert->fullPageWrites = true;
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
}
/*
if (!fullPageWrites)
{
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
Insert->fullPageWrites = false;
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
}
END_CRIT_SECTION();
}
/* Check that the record agrees on what the current (old) timeline is */
if (prevTLI != ThisTimeLineID)
ereport(PANIC,
- (errmsg("unexpected prev timeline ID %u (current timeline ID %u) in checkpoint record",
+ (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
prevTLI, ThisTimeLineID)));
+
/*
- * The new timeline better be in the list of timelines we expect
- * to see, according to the timeline history. It should also not
- * decrease.
+ * The new timeline better be in the list of timelines we expect to see,
+ * according to the timeline history. It should also not decrease.
*/
if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
ereport(PANIC,
- (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
- newTLI, ThisTimeLineID)));
+ (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
+ newTLI, ThisTimeLineID)));
/*
- * If we have not yet reached min recovery point, and we're about
- * to switch to a timeline greater than the timeline of the min
- * recovery point: trouble. After switching to the new timeline,
- * we could not possibly visit the min recovery point on the
- * correct timeline anymore. This can happen if there is a newer
- * timeline in the archive that branched before the timeline the
- * min recovery point is on, and you attempt to do PITR to the
- * new timeline.
+ * If we have not yet reached min recovery point, and we're about to
+ * switch to a timeline greater than the timeline of the min recovery
+ * point: trouble. After switching to the new timeline, we could not
+ * possibly visit the min recovery point on the correct timeline anymore.
+ * This can happen if there is a newer timeline in the archive that
+ * branched before the timeline the min recovery point is on, and you
+ * attempt to do PITR to the new timeline.
*/
if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
lsn < minRecoveryPoint &&
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
- /* Backup blocks are not used in xlog records */
+ /* Backup blocks are not used by XLOG rmgr */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
if (info == XLOG_NEXTOID)
/*
* We used to try to take the maximum of ShmemVariableCache->nextOid
* and the recorded nextOid, but that fails if the OID counter wraps
- * around. Since no OID allocation should be happening during replay
+ * around. Since no OID allocation should be happening during replay
* anyway, better to just believe the record exactly. We still take
* OidGenLock while setting the variable, just in case.
*/
checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
checkPoint.oldestXidDB);
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
{
/* nothing to do here */
}
+ else if (info == XLOG_FPI)
+ {
+ char *data;
+ BkpBlock bkpb;
+
+ /*
+ * Full-page image (FPI) records contain a backup block stored
+ * "inline" in the normal data since the locking when writing hint
+ * records isn't sufficient to use the normal backup block mechanism,
+ * which assumes exclusive lock on the buffer supplied.
+ *
+ * Since the only changes in these backup blocks are hint bits, no
+ * recovery conflicts are generated.
+ *
+ * This also means there is no corresponding API call for this, so an
+ * smgr implementation has no need to implement anything; in particular,
+ * nothing is needed in md.c etc.
+ */
+ data = XLogRecGetData(record);
+ memcpy(&bkpb, data, sizeof(BkpBlock));
+ data += sizeof(BkpBlock);
+
+ RestoreBackupBlockContents(lsn, bkpb, data, false, false);
+ }
else if (info == XLOG_BACKUP_END)
{
XLogRecPtr startpoint;
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->MaxConnections = xlrec.MaxConnections;
+ ControlFile->max_worker_processes = xlrec.max_worker_processes;
ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
ControlFile->wal_level = xlrec.wal_level;
+ /*
+  * Like every other field here, wal_log_hints must come from the replayed
+  * record rather than from this server's own GUC, which need not match the
+  * setting that was in effect when the record was written.
+  */
+ ControlFile->wal_log_hints = xlrec.wal_log_hints;
/*
* Update minRecoveryPoint to ensure that if recovery is aborted, we
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
- * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
+ * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
* disabled, otherwise the archive command or walsender process will read
* the WAL soon after writing it, which is guaranteed to cause a physical
* read if we bypassed the kernel cache. We also skip the
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fsync log segment %s: %m",
- XLogFileNameP(ThisTimeLineID, openLogSegNo))));
+ XLogFileNameP(ThisTimeLineID, openLogSegNo))));
if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
XLogFileClose();
}
if (pg_fsync_writethrough(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync write-through log file %s: %m",
- XLogFileNameP(ThisTimeLineID, segno))));
+ errmsg("could not fsync write-through log file %s: %m",
+ XLogFileNameP(ThisTimeLineID, segno))));
break;
#endif
#ifdef HAVE_FDATASYNC
XLogFileNameP(TimeLineID tli, XLogSegNo segno)
{
char *result = palloc(MAXFNAMELEN);
+
XLogFileName(result, tli, segno);
return result;
}
*
* Every successfully started non-exclusive backup must be stopped by calling
* do_pg_stop_backup() or do_pg_abort_backup().
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
*/
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
backup_started_in_recovery = RecoveryInProgress();
- if (!superuser() && !is_authenticated_user_replication_role())
- ereport(ERROR,
- (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
- errmsg("must be superuser or replication role to run a backup")));
-
/*
* Currently only non-exclusive backup can be taken during recovery.
*/
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
- errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
+ errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
if (strlen(backupidstr) > MAXPGPATH)
ereport(ERROR,
* during an on-line backup even if not doing so at other times, because
* it's quite possible for the backup dump to obtain a "torn" (partially
* written) copy of a database page if it reads the page concurrently with
- * our write to the same page. This can be fixed as long as the first
+ * our write to the same page. This can be fixed as long as the first
* write to the page in the WAL sequence is a full-page write. Hence, we
* turn on forcePageWrites and then force a CHECKPOINT, to ensure there
* are no dirty pages in shared memory that might get dumped while the
* Note that forcePageWrites has no effect during an online backup from
* the standby.
*
- * We must hold WALInsertLock to change the value of forcePageWrites, to
- * ensure adequate interlocking against XLogInsert().
+ * We must hold all the insertion locks to change the value of
+ * forcePageWrites, to ensure adequate interlocking against XLogInsert().
*/
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
if (exclusive)
{
if (XLogCtl->Insert.exclusiveBackup)
{
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"),
else
XLogCtl->Insert.nonExclusiveBackups++;
XLogCtl->Insert.forcePageWrites = true;
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
/* Ensure we release forcePageWrites if fail below */
PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
* old timeline IDs. That would otherwise happen if you called
* pg_start_backup() right after restoring from a PITR archive: the
* first WAL segment containing the startup checkpoint has pages in
- * the beginning with the old timeline ID. That can cause trouble at
+ * the beginning with the old timeline ID. That can cause trouble at
* recovery: we won't have a history file covering the old timeline if
* pg_xlog directory was not included in the base backup and the WAL
* archive was cleared too before starting the backup.
bool checkpointfpw;
/*
- * Force a CHECKPOINT. Aside from being necessary to prevent torn
+ * Force a CHECKPOINT. Aside from being necessary to prevent torn
* page problems, this guarantees that two successive backup runs
* will have different checkpoint positions and hence different
* history file names, even if nothing happened in between.
* taking a checkpoint right after another is not that expensive
* either because only few buffers have been dirtied yet.
*/
- LWLockAcquire(WALInsertLock, LW_SHARED);
+ WALInsertLockAcquireExclusive();
if (XLogCtl->Insert.lastBackupStart < startpoint)
{
XLogCtl->Insert.lastBackupStart = startpoint;
gotUniqueStartpoint = true;
}
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
} while (!gotUniqueStartpoint);
XLByteToSeg(startpoint, _logSegNo);
"%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&stamp_time, log_timezone));
appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
- (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
+ (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
- (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
+ (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
exclusive ? "pg_start_backup" : "streamed");
appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
bool exclusive = DatumGetBool(arg);
/* Update backup counters and forcePageWrites on failure */
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
if (exclusive)
{
Assert(XLogCtl->Insert.exclusiveBackup);
{
XLogCtl->Insert.forcePageWrites = false;
}
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
}
/*
*
* Returns the last WAL position that must be present to restore from this
* backup, and the corresponding timeline ID in *stoptli_p.
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
*/
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
backup_started_in_recovery = RecoveryInProgress();
- if (!superuser() && !is_authenticated_user_replication_role())
- ereport(ERROR,
- (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
- (errmsg("must be superuser or replication role to run a backup"))));
-
/*
* Currently only non-exclusive backup can be taken during recovery.
*/
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
- errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
+ errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
/*
* OK to update backup counters and forcePageWrites
*/
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
if (exclusive)
XLogCtl->Insert.exclusiveBackup = false;
else
{
XLogCtl->Insert.forcePageWrites = false;
}
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
if (exclusive)
{
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"during online backup"),
- errhint("This means that the backup being taken on the standby "
- "is corrupt and should not be used. "
+ errhint("This means that the backup being taken on the standby "
+ "is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the master, "
- "and then try an online backup again.")));
+ "and then try an online backup again.")));
LWLockAcquire(ControlFileLock, LW_SHARED);
errmsg("could not create file \"%s\": %m",
histfilepath)));
fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
- (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
+ (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
(uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
/* transfer remaining lines from label to history file */
* an error handler.
*
* NB: This is only for aborting a non-exclusive backup that doesn't write
- * backup_label. A backup started with pg_stop_backup() needs to be finished
+ * backup_label. A backup started with pg_start_backup() needs to be finished
* with pg_stop_backup().
*/
void
do_pg_abort_backup(void)
{
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ WALInsertLockAcquireExclusive();
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
XLogCtl->Insert.nonExclusiveBackups--;
{
XLogCtl->Insert.forcePageWrites = false;
}
- LWLockRelease(WALInsertLock);
+ WALInsertLockRelease();
}
/*
XLogRecPtr
GetXLogInsertRecPtr(void)
{
- XLogCtlInsert *Insert = &XLogCtl->Insert;
- XLogRecPtr current_recptr;
+ volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint64 current_bytepos; /* copy of Insert->CurrBytePos taken below */
- LWLockAcquire(WALInsertLock, LW_SHARED);
- INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
- LWLockRelease(WALInsertLock);
+ SpinLockAcquire(&Insert->insertpos_lck);
+ current_bytepos = Insert->CurrBytePos; /* read while holding insertpos_lck */
+ SpinLockRelease(&Insert->insertpos_lck);
- return current_recptr;
+ return XLogBytePosToRecPtr(current_bytepos); /* converted after the spinlock is released */
}
/*
*
* If we see a backup_label during recovery, we assume that we are recovering
* from a backup dump file, and we therefore roll forward from the checkpoint
- * identified by the label file, NOT what pg_control says. This avoids the
+ * identified by the label file, NOT what pg_control says. This avoids the
* problem that pg_control might have been archived one or more checkpoints
* later than the start of the dump, and so if we rely on it as the start
* point, we will fail to restore a consistent database state.
StringInfoData buf;
initStringInfo(&buf);
- RmgrTable[record->xl_rmid].rm_desc(&buf,
- record->xl_info,
- XLogRecGetData(record));
+ RmgrTable[record->xl_rmid].rm_desc(&buf, record);
/* don't bother emitting empty description */
if (buf.len > 0)
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
{
XLogPageReadPrivate *private =
- (XLogPageReadPrivate *) xlogreader->private_data;
+ (XLogPageReadPrivate *) xlogreader->private_data;
int emode = private->emode;
uint32 targetPageOff;
- XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+ XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
XLByteToSeg(targetPagePtr, targetSegNo);
targetPageOff = targetPagePtr % XLogSegSize;
readOff = targetPageOff;
if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
{
- char fname[MAXFNAMELEN];
+ char fname[MAXFNAMELEN];
XLogFileName(fname, curFileTLI, readSegNo);
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode_for_file_access(),
- errmsg("could not seek in log segment %s to offset %u: %m",
+ errmsg("could not seek in log segment %s to offset %u: %m",
fname, readOff)));
goto next_record_is_invalid;
}
if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
{
- char fname[MAXFNAMELEN];
+ char fname[MAXFNAMELEN];
XLogFileName(fname, curFileTLI, readSegNo);
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode_for_file_access(),
- errmsg("could not read from log segment %s, offset %u: %m",
+ errmsg("could not read from log segment %s, offset %u: %m",
fname, readOff)));
goto next_record_is_invalid;
}
* 'tliRecPtr' is the position of the WAL record we're interested in. It is
* used to decide which timeline to stream the requested WAL from.
*
- * If the the record is not immediately available, the function returns false
+ * If the record is not immediately available, the function returns false
* if we're not in standby mode. In standby mode, waits for it to become
* available.
*
bool fetching_ckpt, XLogRecPtr tliRecPtr)
{
static pg_time_t last_fail_time = 0;
- pg_time_t now;
+ pg_time_t now;
/*-------
* Standby mode is implemented by a state machine:
*
- * 1. Read from archive (XLOG_FROM_ARCHIVE)
- * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
- * 3. Check trigger file
- * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
- * 5. Rescan timelines
- * 6. Sleep 5 seconds, and loop back to 1.
+ * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
+ * pg_xlog (XLOG_FROM_PG_XLOG)
+ * 2. Check trigger file
+ * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
+ * 4. Rescan timelines
+ * 5. Sleep 5 seconds, and loop back to 1.
*
* Failure to read from the current source advances the state machine to
- * the next state. In addition, successfully reading a file from pg_xlog
- * moves the state machine from state 2 back to state 1 (we always prefer
- * files in the archive over files in pg_xlog).
+ * the next state.
*
* 'currentSource' indicates the current state. There are no currentSource
* values for "check trigger", "rescan timelines", and "sleep" states,
for (;;)
{
- int oldSource = currentSource;
+ int oldSource = currentSource;
/*
* First check if we failed to read from the current source, and
switch (currentSource)
{
case XLOG_FROM_ARCHIVE:
- currentSource = XLOG_FROM_PG_XLOG;
- break;
-
case XLOG_FROM_PG_XLOG:
+
/*
- * Check to see if the trigger file exists. Note that we do
- * this only after failure, so when you create the trigger
- * file, we still finish replaying as much as we can from
- * archive and pg_xlog before failover.
+ * Check to see if the trigger file exists. Note that we
+ * do this only after failure, so when you create the
+ * trigger file, we still finish replaying as much as we
+ * can from archive and pg_xlog before failover.
*/
if (StandbyMode && CheckForStandbyTrigger())
{
}
/*
- * Not in standby mode, and we've now tried the archive and
- * pg_xlog.
+ * Not in standby mode, and we've now tried the archive
+ * and pg_xlog.
*/
if (!StandbyMode)
return false;
/*
- * If primary_conninfo is set, launch walreceiver to try to
- * stream the missing WAL.
+ * If primary_conninfo is set, launch walreceiver to try
+ * to stream the missing WAL.
*
* If fetching_ckpt is TRUE, RecPtr points to the initial
* checkpoint location. In that case, we use RedoStartLSN
*/
if (PrimaryConnInfo)
{
- XLogRecPtr ptr;
- TimeLineID tli;
+ XLogRecPtr ptr;
+ TimeLineID tli;
if (fetching_ckpt)
{
}
else
{
- ptr = RecPtr;
+ ptr = tliRecPtr;
tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
if (curFileTLI > 0 && tli < curFileTLI)
tli, curFileTLI);
}
curFileTLI = tli;
- RequestXLogStreaming(curFileTLI, ptr, PrimaryConnInfo);
+ RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
+ PrimarySlotName);
+ receivedUpto = 0;
}
+
/*
- * Move to XLOG_FROM_STREAM state in either case. We'll get
- * immediate failure if we didn't launch walreceiver, and
- * move on to the next state.
+ * Move to XLOG_FROM_STREAM state in either case. We'll
+ * get immediate failure if we didn't launch walreceiver,
+ * and move on to the next state.
*/
currentSource = XLOG_FROM_STREAM;
break;
case XLOG_FROM_STREAM:
+
/*
- * Failure while streaming. Most likely, we got here because
- * streaming replication was terminated, or promotion was
- * triggered. But we also get here if we find an invalid
- * record in the WAL streamed from master, in which case
- * something is seriously wrong. There's little chance that
- * the problem will just go away, but PANIC is not good for
- * availability either, especially in hot standby mode. So,
- * we treat that the same as disconnection, and retry from
- * archive/pg_xlog again. The WAL in the archive should be
- * identical to what was streamed, so it's unlikely that it
- * helps, but one can hope...
+ * Failure while streaming. Most likely, we got here
+ * because streaming replication was terminated, or
+ * promotion was triggered. But we also get here if we
+ * find an invalid record in the WAL streamed from master,
+ * in which case something is seriously wrong. There's
+ * little chance that the problem will just go away, but
+ * PANIC is not good for availability either, especially
+ * in hot standby mode. So, we treat that the same as
+ * disconnection, and retry from archive/pg_xlog again.
+ * The WAL in the archive should be identical to what was
+ * streamed, so it's unlikely that it helps, but one can
+ * hope...
*/
+
/*
* Before we leave XLOG_FROM_STREAM state, make sure that
* walreceiver is not active, so that it won't overwrite
}
/*
- * XLOG_FROM_STREAM is the last state in our state machine,
- * so we've exhausted all the options for obtaining the
- * requested WAL. We're going to loop back and retry from
- * the archive, but if it hasn't been long since last
- * attempt, sleep 5 seconds to avoid busy-waiting.
+ * XLOG_FROM_STREAM is the last state in our state
+ * machine, so we've exhausted all the options for
+ * obtaining the requested WAL. We're going to loop back
+ * and retry from the archive, but if it hasn't been long
+ * since last attempt, sleep 5 seconds to avoid
+ * busy-waiting.
*/
now = (pg_time_t) time(NULL);
if ((now - last_fail_time) < 5)
else if (currentSource == XLOG_FROM_PG_XLOG)
{
/*
- * We just successfully read a file in pg_xlog. We prefer files
- * in the archive over ones in pg_xlog, so try the next file
- * again from the archive first.
+ * We just successfully read a file in pg_xlog. We prefer files in
+ * the archive over ones in pg_xlog, so try the next file again
+ * from the archive first.
*/
if (InArchiveRecovery)
currentSource = XLOG_FROM_ARCHIVE;
* Try to restore the file from archive, or read an existing
* file from pg_xlog.
*/
- readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
+ readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
+ currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
+ currentSource);
if (readFile >= 0)
return true; /* success! */
break;
case XLOG_FROM_STREAM:
- {
- bool havedata;
-
- /*
- * Check if WAL receiver is still active.
- */
- if (!WalRcvStreaming())
- {
- lastSourceFailed = true;
- break;
- }
-
- /*
- * Walreceiver is active, so see if new data has arrived.
- *
- * We only advance XLogReceiptTime when we obtain fresh WAL
- * from walreceiver and observe that we had already processed
- * everything before the most recent "chunk" that it flushed to
- * disk. In steady state where we are keeping up with the
- * incoming data, XLogReceiptTime will be updated on each cycle.
- * When we are behind, XLogReceiptTime will not advance, so the
- * grace time allotted to conflicting queries will decrease.
- */
- if (RecPtr < receivedUpto)
- havedata = true;
- else
{
- XLogRecPtr latestChunkStart;
+ bool havedata;
- receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
- if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
+ /*
+ * Check if WAL receiver is still active.
+ */
+ if (!WalRcvStreaming())
{
+ lastSourceFailed = true;
+ break;
+ }
+
+ /*
+ * Walreceiver is active, so see if new data has arrived.
+ *
+ * We only advance XLogReceiptTime when we obtain fresh
+ * WAL from walreceiver and observe that we had already
+ * processed everything before the most recent "chunk"
+ * that it flushed to disk. In steady state where we are
+ * keeping up with the incoming data, XLogReceiptTime will
+ * be updated on each cycle. When we are behind,
+ * XLogReceiptTime will not advance, so the grace time
+ * allotted to conflicting queries will decrease.
+ */
+ if (RecPtr < receivedUpto)
havedata = true;
- if (latestChunkStart <= RecPtr)
+ else
+ {
+ XLogRecPtr latestChunkStart;
+
+ receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
+ if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
{
- XLogReceiptTime = GetCurrentTimestamp();
- SetCurrentChunkStartTime(XLogReceiptTime);
+ havedata = true;
+ if (latestChunkStart <= RecPtr)
+ {
+ XLogReceiptTime = GetCurrentTimestamp();
+ SetCurrentChunkStartTime(XLogReceiptTime);
+ }
}
+ else
+ havedata = false;
}
- else
- havedata = false;
- }
- if (havedata)
- {
- /*
- * Great, streamed far enough. Open the file if it's not
- * open already. Also read the timeline history file if
- * we haven't initialized timeline history yet; it should
- * be streamed over and present in pg_xlog by now. Use
- * XLOG_FROM_STREAM so that source info is set correctly
- * and XLogReceiptTime isn't changed.
- */
- if (readFile < 0)
+ if (havedata)
{
- if (!expectedTLEs)
- expectedTLEs = readTimeLineHistory(receiveTLI);
- readFile = XLogFileRead(readSegNo, PANIC,
- receiveTLI,
- XLOG_FROM_STREAM, false);
- Assert(readFile >= 0);
+ /*
+ * Great, streamed far enough. Open the file if it's
+ * not open already. Also read the timeline history
+ * file if we haven't initialized timeline history
+ * yet; it should be streamed over and present in
+ * pg_xlog by now. Use XLOG_FROM_STREAM so that
+ * source info is set correctly and XLogReceiptTime
+ * isn't changed.
+ */
+ if (readFile < 0)
+ {
+ if (!expectedTLEs)
+ expectedTLEs = readTimeLineHistory(receiveTLI);
+ readFile = XLogFileRead(readSegNo, PANIC,
+ receiveTLI,
+ XLOG_FROM_STREAM, false);
+ Assert(readFile >= 0);
+ }
+ else
+ {
+ /* just make sure source info is correct... */
+ readSource = XLOG_FROM_STREAM;
+ XLogReceiptSource = XLOG_FROM_STREAM;
+ return true;
+ }
+ break;
}
- else
+
+ /*
+ * Data not here yet. Check for trigger, then wait for
+ * walreceiver to wake us up when new WAL arrives.
+ */
+ if (CheckForStandbyTrigger())
{
- /* just make sure source info is correct... */
- readSource = XLOG_FROM_STREAM;
- XLogReceiptSource = XLOG_FROM_STREAM;
- return true;
+ /*
+ * Note that we don't "return false" immediately here.
+ * After being triggered, we still want to replay all
+ * the WAL that was already streamed. It's in pg_xlog
+ * now, so we just treat this as a failure, and the
+ * state machine will move on to replay the streamed
+ * WAL from pg_xlog, and then recheck the trigger and
+ * exit replay.
+ */
+ lastSourceFailed = true;
+ break;
}
- break;
- }
- /*
- * Data not here yet. Check for trigger, then wait for
- * walreceiver to wake us up when new WAL arrives.
- */
- if (CheckForStandbyTrigger())
- {
/*
- * Note that we don't "return false" immediately here.
- * After being triggered, we still want to replay all the
- * WAL that was already streamed. It's in pg_xlog now, so
- * we just treat this as a failure, and the state machine
- * will move on to replay the streamed WAL from pg_xlog,
- * and then recheck the trigger and exit replay.
+ * Wait for more WAL to arrive. Time out after 5 seconds,
+ * like when polling the archive, to react to a trigger
+ * file promptly.
*/
- lastSourceFailed = true;
+ WaitLatch(&XLogCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT,
+ 5000L);
+ ResetLatch(&XLogCtl->recoveryWakeupLatch);
break;
}
- /*
- * Wait for more WAL to arrive. Time out after 5 seconds, like
- * when polling the archive, to react to a trigger file
- * promptly.
- */
- WaitLatch(&XLogCtl->recoveryWakeupLatch,
- WL_LATCH_SET | WL_TIMEOUT,
- 5000L);
- ResetLatch(&XLogCtl->recoveryWakeupLatch);
- break;
- }
-
default:
elog(ERROR, "unexpected WAL source %d", currentSource);
}
* process.
*/
HandleStartupProcInterrupts();
- } while (StandbyMode);
+ }
- return false;
+ return false; /* not reached */
}
/*
* in the current WAL page, previously read by XLogPageRead().
*
* 'emode' is the error mode that would be used to report a file-not-found
- * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
+ * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
* we're retrying the exact same record that we've tried previously, only
- * complain the first time to keep the noise down. However, we only do when
+ * complain the first time to keep the noise down. However, we only do so when
* reading from pg_xlog, because we don't expect any invalid records in archive
* or in records streamed from master. Files in the archive should be complete,
* and we should never hit the end of WAL because we stop and wait for more WAL
if (IsPromoteTriggered())
{
/*
- * In 9.1 and 9.2 the postmaster unlinked the promote file
- * inside the signal handler. We now leave the file in place
- * and let the Startup process do the unlink. This allows
- * Startup to know whether we're doing fast or normal
- * promotion. Fast promotion takes precedence.
+ * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
+ * signal handler. It now leaves the file in place and lets the
+ * Startup process do the unlink. This allows Startup to know whether
+ * it should create a full checkpoint before starting up (fallback
+ * mode). Fast promotion takes precedence.
*/
- if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
{
- unlink(FAST_PROMOTE_SIGNAL_FILE);
unlink(PROMOTE_SIGNAL_FILE);
+ unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
fast_promote = true;
}
- else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
{
- unlink(PROMOTE_SIGNAL_FILE);
+ unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
fast_promote = false;
}
- /*
- * We only look for fast promote via the pg_ctl promote option.
- * It would be possible to extend trigger file support for the
- * fast promotion option but that wouldn't be backwards compatible
- * anyway and we're looking to focus further work on the promote
- * option as the right way to signal end of recovery.
- */
- if (fast_promote)
- ereport(LOG,
- (errmsg("received fast promote request")));
- else
- ereport(LOG,
- (errmsg("received promote request")));
+ ereport(LOG, (errmsg("received promote request")));
ResetPromoteTriggered();
triggered = true;
(errmsg("trigger file found: %s", TriggerFile)));
unlink(TriggerFile);
triggered = true;
+ fast_promote = true;
return true;
}
+ else if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat trigger file \"%s\": %m",
+ TriggerFile)));
+
return false;
}
struct stat stat_buf;
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
- stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
return true;
return false;