/*-------------------------------------------------------------------------
*
* xlog.c
- * PostgreSQL transaction log manager
+ * PostgreSQL write-ahead log manager
*
*
- * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/transam/xlog.c
/* User-settable parameters */
-int max_wal_size = 64; /* 1 GB */
-int min_wal_size = 5; /* 80 MB */
+int max_wal_size_mb = 1024; /* 1 GB */
+int min_wal_size_mb = 80; /* 80 MB */
int wal_keep_segments = 0;
int XLOGbuffers = -1;
int XLogArchiveTimeout = 0;
bool XLOG_DEBUG = false;
#endif
+int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
+
/*
* Number of WAL insertion locks to use. A higher value allows more insertions
* to happen concurrently, but adds some CPU overhead to flushing the WAL,
EXCLUSIVE_BACKUP_STOPPING
} ExclusiveBackupState;
+/*
+ * Session status of running backup, used for sanity checks in SQL-callable
+ * functions to start and stop backups.
+ */
+static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
+
/*
* Shared state data for WAL insertion.
*/
bool fullPageWrites;
/*
- * exclusiveBackupState indicates the state of an exclusive backup
- * (see comments of ExclusiveBackupState for more details).
- * nonExclusiveBackups is a counter indicating the number of streaming
- * base backups currently in progress. forcePageWrites is set to true
- * when either of these is non-zero. lastBackupStart is the latest
- * checkpoint redo location used as a starting point for an online
- * backup.
+ * exclusiveBackupState indicates the state of an exclusive backup (see
+ * comments of ExclusiveBackupState for more details). nonExclusiveBackups
+ * is a counter indicating the number of streaming base backups currently
+ * in progress. forcePageWrites is set to true when either of these is
+ * non-zero. lastBackupStart is the latest checkpoint redo location used
+ * as a starting point for an online backup.
*/
ExclusiveBackupState exclusiveBackupState;
int nonExclusiveBackups;
XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
- XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG
- * segment */
+ XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
XLogRecPtr unloggedLSN;
(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
/*
- * These are the number of bytes in a WAL page and segment usable for WAL data.
+ * These are the number of bytes in a WAL page usable for WAL data.
*/
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
-#define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
+
+/* Convert min_wal_size_mb and max_wal_size_mb to equivalent segment count */
+/*
+ * x is a size in megabytes and segsize a segment size in bytes.  Both
+ * arguments are parenthesized so that expression arguments (e.g. "a + b")
+ * expand with the intended precedence.  With integer arguments, segsize
+ * must be at least 1 MB, otherwise the divisor evaluates to zero.
+ */
+#define ConvertToXSegs(x, segsize) \
+ ((x) / ((segsize) / (1024 * 1024)))
+
+/* The number of bytes in a WAL segment usable for WAL data. */
+static int UsableBytesInSegment;
/*
* Private, possibly out-of-date copy of shared LogwrtResult.
static XLogSegNo readSegNo = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
-static XLogSource readSource = 0; /* XLOG_FROM_* code */
+static XLogSource readSource = 0; /* XLOG_FROM_* code */
/*
* Keeps track of which source we're currently reading from. This is
* XLogReceiptSource tracks where we last successfully read some WAL.)
*/
static TimestampTz XLogReceiptTime = 0;
-static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
+static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr; /* start of last record read */
static XLogRecPtr EndRecPtr; /* end+1 of last record read */
-static XLogRecPtr minRecoveryPoint; /* local copy of
- * ControlFile->minRecoveryPoint */
+/*
+ * Local copies of equivalent fields in the control file. When running
+ * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
+ * expect to replay all the WAL available, and updateMinRecoveryPoint is
+ * switched to false to prevent any updates while replaying records.
+ * Those values are kept consistent as long as crash recovery runs.
+ */
+static XLogRecPtr minRecoveryPoint;
static TimeLineID minRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;
bool find_free, XLogSegNo max_segno,
bool use_lock);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
- int source, bool notexistOk);
+ int source, bool notfoundOk);
static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
-static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
-static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
+static void RemoveTempXlogFiles(void);
+static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
+static void RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
*/
if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
{
- int lockno = holdingAllLocks ? 0 : MyLockNo;
+ int lockno = holdingAllLocks ? 0 : MyLockNo;
WALInsertLocks[lockno].l.lastImportantAt = StartPos;
}
EndPos = StartPos + SizeOfXLogRecord;
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
{
- if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
+ uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
+
+ if (offset == EndPos % XLOG_BLCKSZ)
EndPos += SizeOfXLogLongPHD;
else
EndPos += SizeOfXLogShortPHD;
appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
if (!debug_reader)
- debug_reader = XLogReaderAllocate(NULL, NULL);
+ debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);
if (!debug_reader)
{
startbytepos = Insert->CurrBytePos;
ptr = XLogBytePosToEndRecPtr(startbytepos);
- if (ptr % XLOG_SEG_SIZE == 0)
+ if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
{
SpinLockRelease(&Insert->insertpos_lck);
*EndPos = *StartPos = ptr;
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
- segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
- if (segleft != XLOG_SEG_SIZE)
+ segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
+ if (segleft != wal_segment_size)
{
/* consume the rest of the segment */
*EndPos += segleft;
*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
- Assert((*EndPos) % XLOG_SEG_SIZE == 0);
+ Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
/*
* If the block LSN is already ahead of this WAL record, we can't
- * expect contents to match. This can happen if recovery is restarted.
+ * expect contents to match. This can happen if recovery is
+ * restarted.
*/
if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
continue;
if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
{
elog(FATAL,
- "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
rnode.spcNode, rnode.dbNode, rnode.relNode,
forknum, blkno);
}
pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
/* skip over the page header */
- if (CurrPos % XLogSegSize == 0)
+ if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
{
CurrPos += SizeOfXLogLongPHD;
currpos += SizeOfXLogLongPHD;
/*
* If this was an xlog-switch, it's not enough to write the switch record,
- * we also have to consume all the remaining space in the WAL segment. We
- * have already reserved it for us, but we still need to make sure it's
- * allocated and zeroed in the WAL buffers so that when the caller (or
- * someone else) does XLogWrite(), it can really write out all the zeros.
+ * we also have to consume all the remaining space in the WAL segment. We
+ * have already reserved that space, but we need to actually fill it.
*/
- if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
+ if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
{
/* An xlog-switch record doesn't contain any data besides the header */
Assert(write_len == SizeOfXLogRecord);
- /*
- * We do this one page at a time, to make sure we don't deadlock
- * against ourselves if wal_buffers < XLOG_SEG_SIZE.
- */
- Assert(EndPos % XLogSegSize == 0);
+ /* Assert that we did reserve the right amount of space */
+ Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
- /* Use up all the remaining space on the first page */
+ /* Use up all the remaining space on the current page */
CurrPos += freespace;
+ /*
+ * Cause all remaining pages in the segment to be flushed, leaving the
+ * XLog position where it should be, at the start of the next segment.
+ * We do this one page at a time, to make sure we don't deadlock
+ * against ourselves if wal_buffers < wal_segment_size.
+ */
while (CurrPos < EndPos)
{
- /* initialize the next page (if not initialized already) */
- WALInsertLockUpdateInsertingAt(CurrPos);
- AdvanceXLInsertBuffer(CurrPos, false);
+ /*
+ * The minimal action to flush the page would be to call
+ * WALInsertLockUpdateInsertingAt(CurrPos) followed by
+ * AdvanceXLInsertBuffer(...). The page would be left initialized
+ * mostly to zeros, except for the page header (always the short
+ * variant, as this is never a segment's first page).
+ *
+ * The large vistas of zeros are good for compressibility, but the
+ * headers interrupting them every XLOG_BLCKSZ (with values that
+ * differ from page to page) are not. The effect varies with
+ * compression tool, but bzip2 for instance compresses about an
+ * order of magnitude worse if those headers are left in place.
+ *
+ * Rather than complicating AdvanceXLInsertBuffer itself (which is
+ * called in heavily-loaded circumstances as well as this lightly-
+ * loaded one) with variant behavior, we just use GetXLogBuffer
+ * (which itself calls the two methods we need) to get the pointer
+ * and zero most of the page. Then we just zero the page header.
+ */
+ currpos = GetXLogBuffer(CurrPos);
+ MemSet(currpos, 0, SizeOfXLogShortPHD);
+
CurrPos += XLOG_BLCKSZ;
}
}
* WALInsertLockAcquireExclusive.
*/
LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
- &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
+ &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
insertingAt);
}
else
* the page header.
*/
if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
- ptr % XLOG_SEG_SIZE > XLOG_BLCKSZ)
+ XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
initializedUpto = ptr - SizeOfXLogShortPHD;
else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
- ptr % XLOG_SEG_SIZE < XLOG_BLCKSZ)
+ XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
initializedUpto = ptr - SizeOfXLogLongPHD;
else
initializedUpto = ptr;
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
- XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
+ XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
return result;
}
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
- XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
+ XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
return result;
}
uint32 offset;
uint64 result;
- XLByteToSeg(ptr, fullsegs);
+ XLByteToSeg(ptr, fullsegs, wal_segment_size);
- fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
+ fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
offset = ptr % XLOG_BLCKSZ;
if (fullpages == 0)
{
result = fullsegs * UsableBytesInSegment +
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
- (fullpages - 1) * UsableBytesInPage; /* full pages */
+ (fullpages - 1) * UsableBytesInPage; /* full pages */
if (offset > 0)
{
Assert(offset >= SizeOfXLogShortPHD);
/*
* If online backup is not in progress, mark the header to indicate
- * that* WAL records beginning in this page have removable backup
+ * that WAL records beginning in this page have removable backup
* blocks. This allows the WAL archiver to know whether it is safe to
* compress archived WAL data by transforming full-block records into
* the non-full-block format. It is sufficient to record this at the
/*
* If first page of an XLOG segment file, make it a long header.
*/
- if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
+ if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
{
XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
NewLongPage->xlp_sysid = ControlFile->system_identifier;
- NewLongPage->xlp_seg_size = XLogSegSize;
+ NewLongPage->xlp_seg_size = wal_segment_size;
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
NewPage->xlp_info |= XLP_LONG_HEADER;
}
}
/*
- * Calculate CheckPointSegments based on max_wal_size and
+ * Calculate CheckPointSegments based on max_wal_size_mb and
* checkpoint_completion_target.
*/
static void
/*-------
* Calculate the distance at which to trigger a checkpoint, to avoid
- * exceeding max_wal_size. This is based on two assumptions:
+ * exceeding max_wal_size_mb. This is based on two assumptions:
*
- * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
+ * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
+ * WAL for two checkpoint cycles to allow us to recover from the
+ * secondary checkpoint if the first checkpoint failed, though we
+ * only did this on the master anyway, not on standby. Keeping just
+ * one checkpoint simplifies processing and reduces disk space in
+ * many smaller databases.)
* b) during checkpoint, we consume checkpoint_completion_target *
* number of segments consumed between checkpoints.
*-------
*/
- target = (double) max_wal_size / (2.0 + CheckPointCompletionTarget);
+ target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
+ (1.0 + CheckPointCompletionTarget);
/* round down */
CheckPointSegments = (int) target;
void
assign_max_wal_size(int newval, void *extra)
{
- max_wal_size = newval;
+ max_wal_size_mb = newval;
CalculateCheckpointSegments();
}
* XLOG segments? Returns the highest segment that should be preallocated.
*/
static XLogSegNo
-XLOGfileslop(XLogRecPtr PriorRedoPtr)
+XLOGfileslop(XLogRecPtr RedoRecPtr)
{
XLogSegNo minSegNo;
XLogSegNo maxSegNo;
XLogSegNo recycleSegNo;
/*
- * Calculate the segment numbers that min_wal_size and max_wal_size
+ * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
* correspond to. Always recycle enough segments to meet the minimum, and
* remove enough segments to stay below the maximum.
*/
- minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
- maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
+ minSegNo = RedoRecPtr / wal_segment_size +
+ ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
+ maxSegNo = RedoRecPtr / wal_segment_size +
+ ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
/*
* Between those limits, recycle enough segments to get us through to the
* To estimate where the next checkpoint will finish, assume that the
* system runs steadily consuming CheckPointDistanceEstimate bytes between
* every checkpoint.
- *
- * The reason this calculation is done from the prior checkpoint, not the
- * one that just finished, is that this behaves better if some checkpoint
- * cycles are abnormally short, like if you perform a manual checkpoint
- * right after a timed one. The manual checkpoint will make almost a full
- * cycle's worth of WAL segments available for recycling, because the
- * segments from the prior's prior, fully-sized checkpoint cycle are no
- * longer needed. However, the next checkpoint will make only few segments
- * available for recycling, the ones generated between the timed
- * checkpoint and the manual one right after that. If at the manual
- * checkpoint we only retained enough segments to get us to the next timed
- * one, and removed the rest, then at the next checkpoint we would not
- * have enough segments around for recycling, to get us to the checkpoint
- * after that. Basing the calculations on the distance from the prior redo
- * pointer largely fixes that problem.
- */
- distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
+ */
+ distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
/* add 10% for good measure. */
distance *= 1.10;
- recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
+ recycleSegNo = (XLogSegNo) ceil(((double) RedoRecPtr + distance) /
+ wal_segment_size);
if (recycleSegNo < minSegNo)
recycleSegNo = minSegNo;
{
XLogSegNo old_segno;
- XLByteToSeg(RedoRecPtr, old_segno);
+ XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
return true;
/*
* Write and/or fsync the log at least as far as WriteRqst indicates.
*
- * If flexible == TRUE, we don't have to write as far as WriteRqst, but
+ * If flexible == true, we don't have to write as far as WriteRqst, but
* may stop at any convenient boundary (such as a cache or logfile boundary).
* This option allows us to avoid uselessly issuing multiple writes when a
* single one would do.
LogwrtResult.Write = EndPtr;
ispartialpage = WriteRqst.Write < LogwrtResult.Write;
- if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
+ if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size))
{
/*
* Switch to new logfile segment. We cannot have any pending
Assert(npages == 0);
if (openLogFile >= 0)
XLogFileClose();
- XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
+ XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size);
/* create/use new log file */
use_existent = true;
/* Make sure we have the current logfile open */
if (openLogFile < 0)
{
- XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
+ XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size);
openLogFile = XLogFileOpen(openLogSegNo);
openLogOff = 0;
}
{
/* first of group */
startidx = curridx;
- startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
+ startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
+ wal_segment_size);
}
npages++;
last_iteration = WriteRqst.Write <= LogwrtResult.Write;
finishing_seg = !ispartialpage &&
- (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
+ (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
if (last_iteration ||
curridx == XLogCtl->XLogCacheBlck ||
if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not seek in log file %s to offset %u: %m",
- XLogFileNameP(ThisTimeLineID, openLogSegNo),
- startoffset)));
+ errmsg("could not seek in log file %s to offset %u: %m",
+ XLogFileNameP(ThisTimeLineID, openLogSegNo),
+ startoffset)));
openLogOff = startoffset;
}
do
{
errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = write(openLogFile, from, nleft);
+ pgstat_report_wait_end();
if (written <= 0)
{
if (errno == EINTR)
(errcode_for_file_access(),
errmsg("could not write to log file %s "
"at offset %u, length %zu: %m",
- XLogFileNameP(ThisTimeLineID, openLogSegNo),
+ XLogFileNameP(ThisTimeLineID, openLogSegNo),
openLogOff, nbytes)));
}
nleft -= written;
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
- LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
+ LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
if (XLogArchivingActive())
XLogArchiveNotifySeg(openLogSegNo);
sync_method != SYNC_METHOD_OPEN_DSYNC)
{
if (openLogFile >= 0 &&
- !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
+ !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size))
XLogFileClose();
if (openLogFile < 0)
{
- XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
+ XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size);
openLogFile = XLogFileOpen(openLogSegNo);
openLogOff = 0;
}
if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
return;
+ /*
+ * An invalid minRecoveryPoint means that we need to recover all the WAL,
+ * i.e., we're doing crash recovery. We never modify the control file's
+ * value in that case, so we can short-circuit future checks here too. The
+ * local values of minRecoveryPoint and minRecoveryPointTLI should not be
+ * updated until crash recovery finishes. We only do this for the startup
+ * process as it should not update its own reference of minRecoveryPoint
+ * until it has finished crash recovery to make sure that all WAL
+ * available is replayed in this case. This also spares the startup
+ * process from taking extra locks on the control file.
+ */
+ if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
+ {
+ updateMinRecoveryPoint = false;
+ return;
+ }
+
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
/* update local copy */
minRecoveryPoint = ControlFile->minRecoveryPoint;
minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
- /*
- * An invalid minRecoveryPoint means that we need to recover all the WAL,
- * i.e., we're doing crash recovery. We never modify the control file's
- * value in that case, so we can short-circuit future checks here too.
- */
- if (minRecoveryPoint == 0)
+ if (XLogRecPtrIsInvalid(minRecoveryPoint))
updateMinRecoveryPoint = false;
else if (force || minRecoveryPoint < lsn)
{
if (!force && newMinRecoveryPoint < lsn)
elog(WARNING,
- "xlog min recovery request %X/%X is past current point %X/%X",
+ "xlog min recovery request %X/%X is past current point %X/%X",
(uint32) (lsn >> 32), (uint32) lsn,
(uint32) (newMinRecoveryPoint >> 32),
(uint32) newMinRecoveryPoint);
minRecoveryPointTLI = newMinRecoveryPointTLI;
ereport(DEBUG2,
- (errmsg("updated min recovery point to %X/%X on timeline %u",
- (uint32) (minRecoveryPoint >> 32),
- (uint32) minRecoveryPoint,
- newMinRecoveryPointTLI)));
+ (errmsg("updated min recovery point to %X/%X on timeline %u",
+ (uint32) (minRecoveryPoint >> 32),
+ (uint32) minRecoveryPoint,
+ newMinRecoveryPointTLI)));
}
}
LWLockRelease(ControlFileLock);
elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
(uint32) (record >> 32), (uint32) record,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
- (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
+ (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
START_CRIT_SECTION();
*/
if (LogwrtResult.Flush < record)
elog(ERROR,
- "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
+ "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
(uint32) (record >> 32), (uint32) record,
- (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
+ (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}
/*
*
* This routine is invoked periodically by the background walwriter process.
*
- * Returns TRUE if there was any work to do, even if we skipped flushing due
+ * Returns true if there was any work to do, even if we skipped flushing due
* to wal_writer_delay/wal_writer_flush_after.
*/
bool
{
if (openLogFile >= 0)
{
- if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
+ if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size))
{
XLogFileClose();
}
(uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
(uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
- (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
+ (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
START_CRIT_SECTION();
*/
if (RecoveryInProgress())
{
- /* Quick exit if already known updated */
+ /*
+ * An invalid minRecoveryPoint means that we need to recover all the
+ * WAL, i.e., we're doing crash recovery. We never modify the control
+ * file's value in that case, so we can short-circuit future checks
+ * here too. This triggers a quick exit path for the startup process,
+ * which cannot update its local copy of minRecoveryPoint as long as
+ * it has not replayed all WAL available when doing crash recovery.
+ */
+ if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
+ updateMinRecoveryPoint = false;
+
+ /* Quick exit if already known to be updated or cannot be updated */
if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
return false;
LWLockRelease(ControlFileLock);
/*
- * An invalid minRecoveryPoint means that we need to recover all the
- * WAL, i.e., we're doing crash recovery. We never modify the control
- * file's value in that case, so we can short-circuit future checks
- * here too.
+ * Check minRecoveryPoint for any other process than the startup
+ * process doing crash recovery, which should not update the control
+ * file value if crash recovery is still running.
*/
- if (minRecoveryPoint == 0)
+ if (XLogRecPtrIsInvalid(minRecoveryPoint))
updateMinRecoveryPoint = false;
/* check again */
*
* log, seg: identify segment to be created/opened.
*
- * *use_existent: if TRUE, OK to use a pre-existing file (else, any
- * pre-existing file will be deleted). On return, TRUE if a pre-existing
+ * *use_existent: if true, OK to use a pre-existing file (else, any
+ * pre-existing file will be deleted). On return, true if a pre-existing
* file was used.
*
- * use_lock: if TRUE, acquire ControlFileLock while moving file into
- * place. This should be TRUE except during bootstrap log creation. The
+ * use_lock: if true, acquire ControlFileLock while moving file into
+ * place. This should be true except during bootstrap log creation. The
* caller must *not* hold the lock at call.
*
* Returns FD of opened file.
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
- char zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
- char *zbuffer;
+ PGAlignedXLogBlock zbuffer;
XLogSegNo installed_segno;
XLogSegNo max_segno;
int fd;
int nbytes;
- XLogFilePath(path, ThisTimeLineID, logsegno);
+ XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
/*
* Try to use existent file (checkpoint maker may have created it already)
*/
if (*use_existent)
{
- fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
- S_IRUSR | S_IWUSR);
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
if (fd < 0)
{
if (errno != ENOENT)
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
- fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
* fsync below) that all the indirect blocks are down on disk. Therefore,
* fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
* log file.
- *
- * Note: ensure the buffer is reasonably well-aligned; this may save a few
- * cycles transferring data to the kernel.
*/
- zbuffer = (char *) MAXALIGN(zbuffer_raw);
- memset(zbuffer, 0, XLOG_BLCKSZ);
- for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
+ memset(zbuffer.data, 0, XLOG_BLCKSZ);
+ for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
{
errno = 0;
- if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
+ pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
+ if ((int) write(fd, zbuffer.data, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
{
int save_errno = errno;
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
+ pgstat_report_wait_end();
}
+ pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0)
{
+ int save_errno = errno;
+
close(fd);
+ errno = save_errno;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
}
+ pgstat_report_wait_end();
if (close(fd))
ereport(ERROR,
*use_existent = false;
/* Now open original target segment (might not be file I just made) */
- fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
- S_IRUSR | S_IWUSR);
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
- char buffer[XLOG_BLCKSZ];
+ PGAlignedXLogBlock buffer;
int srcfd;
int fd;
int nbytes;
/*
* Open the source file
*/
- XLogFilePath(path, srcTLI, srcsegno);
- srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+ XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
+ srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
if (srcfd < 0)
ereport(ERROR,
(errcode_for_file_access(),
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
- fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
/*
* Do the data copying.
*/
- for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
+ for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
{
int nread;
* zeros.
*/
if (nread < sizeof(buffer))
- memset(buffer, 0, sizeof(buffer));
+ memset(buffer.data, 0, sizeof(buffer));
if (nread > 0)
{
+ int r;
+
if (nread > sizeof(buffer))
nread = sizeof(buffer);
- errno = 0;
- if (read(srcfd, buffer, nread) != nread)
+ pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
+ r = read(srcfd, buffer.data, nread);
+ if (r != nread)
{
- if (errno != 0)
+ if (r < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
path)));
else
ereport(ERROR,
- (errmsg("not enough data in file \"%s\"",
- path)));
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read file \"%s\": read %d of %zu",
+ path, r, (Size) nread)));
}
+ pgstat_report_wait_end();
}
errno = 0;
- if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
+ pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
+ if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
{
int save_errno = errno;
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
+ pgstat_report_wait_end();
}
+ pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
if (pg_fsync(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
+ pgstat_report_wait_end();
if (CloseTransientFile(fd))
ereport(ERROR,
* filename while it's being created) and to recycle an old segment.
*
* *segno: identify segment to install as (or first possible target).
- * When find_free is TRUE, this is modified on return to indicate the
+ * When find_free is true, this is modified on return to indicate the
* actual installation location or last segment searched.
*
* tmppath: initial name of file to install. It will be renamed into place.
*
- * find_free: if TRUE, install the new segment at the first empty segno
- * number at or after the passed numbers. If FALSE, install the new segment
+ * find_free: if true, install the new segment at the first empty segno
+ * number at or after the passed numbers. If false, install the new segment
* exactly where specified, deleting any existing segment file there.
*
* max_segno: maximum segment number to install the new file as. Fail if no
* free slot is found between *segno and max_segno. (Ignored when find_free
- * is FALSE.)
+ * is false.)
*
- * use_lock: if TRUE, acquire ControlFileLock while moving file into
- * place. This should be TRUE except during bootstrap log creation. The
+ * use_lock: if true, acquire ControlFileLock while moving file into
+ * place. This should be true except during bootstrap log creation. The
* caller must *not* hold the lock at call.
*
- * Returns TRUE if the file was installed successfully. FALSE indicates that
+ * Returns true if the file was installed successfully. false indicates that
* max_segno limit was exceeded, or an error occurred while renaming the
* file into place.
*/
char path[MAXPGPATH];
struct stat stat_buf;
- XLogFilePath(path, ThisTimeLineID, *segno);
+ XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
/*
* We want to be sure that only one process does this at a time.
if (!find_free)
{
/* Force installation: get rid of any pre-existing segment file */
- unlink(path);
+ durable_unlink(path, DEBUG1);
}
else
{
return false;
}
(*segno)++;
- XLogFilePath(path, ThisTimeLineID, *segno);
+ XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
}
}
char path[MAXPGPATH];
int fd;
- XLogFilePath(path, ThisTimeLineID, segno);
+ XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
- fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
- S_IRUSR | S_IWUSR);
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not open transaction log file \"%s\": %m", path)));
+ errmsg("could not open file \"%s\": %m", path)));
return fd;
}
char path[MAXPGPATH];
int fd;
- XLogFileName(xlogfname, tli, segno);
+ XLogFileName(xlogfname, tli, segno, wal_segment_size);
switch (source)
{
restoredFromArchive = RestoreArchivedFile(path, xlogfname,
"RECOVERYXLOG",
- XLogSegSize,
+ wal_segment_size,
InRedo);
if (!restoredFromArchive)
return -1;
case XLOG_FROM_PG_WAL:
case XLOG_FROM_STREAM:
- XLogFilePath(path, tli, segno);
+ XLogFilePath(path, tli, segno, wal_segment_size);
restoredFromArchive = false;
break;
snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
}
- fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
+ fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
if (fd >= 0)
{
/* Success! */
}
/* Couldn't find it. For simplicity, complain about front timeline */
- XLogFilePath(path, recoveryTargetTLI, segno);
+ XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
errno = ENOENT;
ereport(emode,
(errcode_for_file_access(),
if (close(openLogFile))
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not close log file %s: %m",
+ errmsg("could not close file \"%s\": %m",
XLogFileNameP(ThisTimeLineID, openLogSegNo))));
openLogFile = -1;
}
XLogSegNo _logSegNo;
int lf;
bool use_existent;
+ uint64 offset;
- XLByteToPrevSeg(endptr, _logSegNo);
- if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
+ XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
+ offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
+ if (offset >= (uint32) (0.75 * wal_segment_size))
{
_logSegNo++;
use_existent = true;
* existed while the server has been running, as this function always
* succeeds if no WAL segments have been removed since startup.
* 'tli' is only used in the error message.
+ *
+ * Note: this function guarantees to keep errno unchanged on return.
+ * This supports callers that use this to possibly deliver a better
+ * error message about a missing file, while still being able to throw
+ * a normal file-access error afterwards, if this does return.
*/
void
CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
{
+ int save_errno = errno;
XLogSegNo lastRemovedSegNo;
SpinLockAcquire(&XLogCtl->info_lck);
{
char filename[MAXFNAMELEN];
- XLogFileName(filename, tli, segno);
+ XLogFileName(filename, tli, segno, wal_segment_size);
+ errno = save_errno;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("requested WAL segment %s has already been removed",
filename)));
}
+ errno = save_errno;
}
/*
uint32 tli;
XLogSegNo segno;
- XLogFromFileName(filename, &tli, &segno);
+ XLogFromFileName(filename, &tli, &segno, wal_segment_size);
SpinLockAcquire(&XLogCtl->info_lck);
if (segno > XLogCtl->lastRemovedSegNo)
SpinLockRelease(&XLogCtl->info_lck);
}
+/*
+ * Remove all temporary log files in pg_wal
+ *
+ * This is called at the beginning of recovery after a previous crash,
+ * at a point where no other processes write fresh WAL data.
+ *
+ * Every entry of XLOGDIR whose name starts with "xlogtemp." is unlinked.
+ * Removal is best-effort: if one file cannot be unlinked, the failure is
+ * reported at LOG level (with the reason from errno) and the scan carries
+ * on with the remaining entries.  The DEBUG2 "removed" message is emitted
+ * only for files that were actually removed.
+ */
+static void
+RemoveTempXlogFiles(void)
+{
+	DIR		   *xldir;
+	struct dirent *xlde;
+
+	elog(DEBUG2, "removing all temporary WAL segments");
+
+	/*
+	 * The AllocateDir() result is handed straight to ReadDir(), as the other
+	 * directory scans in this file do.
+	 */
+	xldir = AllocateDir(XLOGDIR);
+	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
+	{
+		char		path[MAXPGPATH];
+
+		/* Skip everything that is not a temporary segment */
+		if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
+			continue;
+
+		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
+
+		/* Only claim success when the file is really gone */
+		if (unlink(path) < 0)
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not remove file \"%s\": %m", path)));
+		else
+			elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
+	}
+	FreeDir(xldir);
+}
+
/*
* Recycle or remove all log files older or equal to passed segno.
*
- * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
- * redo pointer of the previous checkpoint. These are used to determine
+ * endptr is current (or recent) end of xlog, and RedoRecPtr is the
+ * redo pointer of the last checkpoint. These are used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
*/
static void
-RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
+RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
{
DIR *xldir;
struct dirent *xlde;
char lastoff[MAXFNAMELEN];
- xldir = AllocateDir(XLOGDIR);
- if (xldir == NULL)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open transaction log directory \"%s\": %m",
- XLOGDIR)));
-
/*
* Construct a filename of the last segment to be kept. The timeline ID
* doesn't matter, we ignore that in the comparison. (During recovery,
* ThisTimeLineID isn't set, so we can't use that.)
*/
- XLogFileName(lastoff, 0, segno);
+ XLogFileName(lastoff, 0, segno, wal_segment_size);
elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
lastoff);
+ xldir = AllocateDir(XLOGDIR);
+
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
/* Ignore files that are not XLOG segments */
/* Update the last removed location in shared memory first */
UpdateLastRemovedPtr(xlde->d_name);
- RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
+ RemoveXlogFile(xlde->d_name, RedoRecPtr, endptr);
}
}
}
char switchseg[MAXFNAMELEN];
XLogSegNo endLogSegNo;
- XLByteToPrevSeg(switchpoint, endLogSegNo);
-
- xldir = AllocateDir(XLOGDIR);
- if (xldir == NULL)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open transaction log directory \"%s\": %m",
- XLOGDIR)));
+ XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
/*
* Construct a filename of the last segment to be kept.
*/
- XLogFileName(switchseg, newTLI, endLogSegNo);
+ XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
switchseg);
+ xldir = AllocateDir(XLOGDIR);
+
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
/* Ignore files that are not XLOG segments */
/*
* Recycle or remove a log file that's no longer needed.
*
- * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
- * redo pointer of the previous checkpoint. These are used to determine
+ * endptr is current (or recent) end of xlog, and RedoRecPtr is the
+ * redo pointer of the last checkpoint. These are used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
- * If PriorRedoRecPtr is not known, pass invalid, and the function will
- * recycle, somewhat arbitrarily, 10 future segments.
+ * If RedoRecPtr is not known, pass invalid, and the function will recycle,
+ * somewhat arbitrarily, 10 future segments.
*/
static void
-RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
+RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
{
char path[MAXPGPATH];
#ifdef WIN32
/*
* Initialize info about where to try to recycle to.
*/
- XLByteToPrevSeg(endptr, endlogSegNo);
- if (PriorRedoPtr == InvalidXLogRecPtr)
+ XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
+ if (RedoRecPtr == InvalidXLogRecPtr)
recycleSegNo = endlogSegNo + 10;
else
- recycleSegNo = XLOGfileslop(PriorRedoPtr);
+ recycleSegNo = XLOGfileslop(RedoRecPtr);
snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
true, recycleSegNo, true))
{
ereport(DEBUG2,
- (errmsg("recycled transaction log file \"%s\"",
+ (errmsg("recycled write-ahead log file \"%s\"",
segname)));
CheckpointStats.ckpt_segs_recycled++;
/* Needn't recheck that slot on future iterations */
int rc;
ereport(DEBUG2,
- (errmsg("removing transaction log file \"%s\"",
+ (errmsg("removing write-ahead log file \"%s\"",
segname)));
#ifdef WIN32
{
ereport(LOG,
(errcode_for_file_access(),
- errmsg("could not rename old transaction log file \"%s\": %m",
- path)));
+ errmsg("could not rename file \"%s\": %m",
+ path)));
return;
}
- rc = unlink(newpath);
+ rc = durable_unlink(newpath, LOG);
#else
- rc = unlink(path);
+ rc = durable_unlink(path, LOG);
#endif
if (rc != 0)
{
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not remove old transaction log file \"%s\": %m",
- path)));
+ /* Message already logged by durable_unlink() */
return;
}
CheckpointStats.ckpt_segs_removed++;
{
ereport(LOG,
(errmsg("creating missing WAL directory \"%s\"", path)));
- if (mkdir(path, S_IRWXU) < 0)
+ if (MakePGDirectory(path) < 0)
ereport(FATAL,
(errmsg("could not create missing directory \"%s\": %m",
path)));
{
DIR *xldir;
struct dirent *xlde;
- char path[MAXPGPATH];
+ char path[MAXPGPATH + sizeof(XLOGDIR)];
xldir = AllocateDir(XLOGDIR);
- if (xldir == NULL)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open transaction log directory \"%s\": %m",
- XLOGDIR)));
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
{
if (XLogArchiveCheckDone(xlde->d_name))
{
- ereport(DEBUG2,
- (errmsg("removing transaction log backup history file \"%s\"",
- xlde->d_name)));
- snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
+ elog(DEBUG2, "removing WAL backup history file \"%s\"",
+ xlde->d_name);
+ snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
unlink(path);
XLogArchiveCleanup(xlde->d_name);
}
/*
* Attempt to read an XLOG record.
*
- * If RecPtr is not NULL, try to read a record at that position. Otherwise
+ * If RecPtr is valid, try to read a record at that position. Otherwise
* try to read a record just after the last one previously read.
*
* If no valid record is available, returns NULL, or fails if emode is PANIC.
if (errormsg)
ereport(emode_for_corrupt_record(emode,
RecPtr ? RecPtr : EndRecPtr),
- (errmsg_internal("%s", errormsg) /* already translated */ ));
+ (errmsg_internal("%s", errormsg) /* already translated */ ));
}
/*
XLogSegNo segno;
int32 offset;
- XLByteToSeg(xlogreader->latestPagePtr, segno);
- offset = xlogreader->latestPagePtr % XLogSegSize;
- XLogFileName(fname, xlogreader->readPageTLI, segno);
+ XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
+ offset = XLogSegmentOffset(xlogreader->latestPagePtr,
+ wal_segment_size);
+ XLogFileName(fname, xlogreader->readPageTLI, segno,
+ wal_segment_size);
ereport(emode_for_corrupt_record(emode,
RecPtr ? RecPtr : EndRecPtr),
- (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
- xlogreader->latestPageTLI,
- fname,
- offset)));
+ (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
+ xlogreader->latestPageTLI,
+ fname,
+ offset)));
record = NULL;
}
* pg_wal, so we are presumably now consistent.
*
* We require that there's at least some valid WAL present in
- * pg_wal, however (!fetch_ckpt). We could recover using the WAL
- * from the archive, even if pg_wal is completely empty, but we'd
- * have no idea how far we'd have to replay to reach consistency.
- * So err on the safe side and give up.
+ * pg_wal, however (!fetching_ckpt). We could recover using the
+ * WAL from the archive, even if pg_wal is completely empty, but
+ * we'd have no idea how far we'd have to replay to reach
+ * consistency. So err on the safe side and give up.
*/
if (!InArchiveRecovery && ArchiveRecoveryRequested &&
!fetching_ckpt)
minRecoveryPoint = ControlFile->minRecoveryPoint;
minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ /*
+ * The startup process can update its local copy of
+ * minRecoveryPoint from this point.
+ */
+ updateMinRecoveryPoint = true;
+
UpdateControlFile();
LWLockRelease(ControlFileLock);
WriteControlFile(void)
{
int fd;
- char buffer[PG_CONTROL_SIZE]; /* need not be aligned */
+ char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
+
+ /*
+ * Ensure that the size of the pg_control data structure is sane. See the
+ * comments for these symbols in pg_control.h.
+ */
+ StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
+ "pg_control is too large for atomic disk writes");
+ StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
+ "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
/*
* Initialize version and compatibility-check fields
ControlFile->blcksz = BLCKSZ;
ControlFile->relseg_size = RELSEG_SIZE;
ControlFile->xlog_blcksz = XLOG_BLCKSZ;
- ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
+ ControlFile->xlog_seg_size = wal_segment_size;
ControlFile->nameDataLen = NAMEDATALEN;
ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
FIN_CRC32C(ControlFile->crc);
/*
- * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
- * excess over sizeof(ControlFileData). This reduces the odds of
+ * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
+ * the excess over sizeof(ControlFileData). This reduces the odds of
* premature-EOF errors when reading pg_control. We'll still fail when we
* check the contents of the file, but hopefully with a more specific
* error than "couldn't read pg_control".
*/
- if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
- elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
-
- memset(buffer, 0, PG_CONTROL_SIZE);
+ memset(buffer, 0, PG_CONTROL_FILE_SIZE);
memcpy(buffer, ControlFile, sizeof(ControlFileData));
fd = BasicOpenFile(XLOG_CONTROL_FILE,
- O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not create control file \"%s\": %m",
+ errmsg("could not create file \"%s\": %m",
XLOG_CONTROL_FILE)));
errno = 0;
- if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
+ if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
errno = ENOSPC;
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not write to control file: %m")));
+ errmsg("could not write to file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
}
+ pgstat_report_wait_end();
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
if (pg_fsync(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync control file: %m")));
+ errmsg("could not fsync file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ pgstat_report_wait_end();
if (close(fd))
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not close control file: %m")));
+ errmsg("could not close file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
}
static void
{
pg_crc32c crc;
int fd;
+ static char wal_segsz_str[20];
+ int r;
/*
* Read data...
*/
fd = BasicOpenFile(XLOG_CONTROL_FILE,
- O_RDWR | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ O_RDWR | PG_BINARY);
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not open control file \"%s\": %m",
+ errmsg("could not open file \"%s\": %m",
XLOG_CONTROL_FILE)));
- if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
- ereport(PANIC,
- (errcode_for_file_access(),
- errmsg("could not read from control file: %m")));
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
+ r = read(fd, ControlFile, sizeof(ControlFileData));
+ if (r != sizeof(ControlFileData))
+ {
+ if (r < 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ else
+ ereport(PANIC,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read file \"%s\": read %d of %zu",
+ XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
+ }
+ pgstat_report_wait_end();
close(fd);
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
- " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
- ControlFile->pg_control_version, ControlFile->pg_control_version,
+ " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
+ ControlFile->pg_control_version, ControlFile->pg_control_version,
PG_CONTROL_VERSION, PG_CONTROL_VERSION),
errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
- " but the server was compiled with PG_CONTROL_VERSION %d.",
- ControlFile->pg_control_version, PG_CONTROL_VERSION),
+ " but the server was compiled with PG_CONTROL_VERSION %d.",
+ ControlFile->pg_control_version, PG_CONTROL_VERSION),
errhint("It looks like you need to initdb.")));
/* Now check the CRC. */
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
- " but the server was compiled with CATALOG_VERSION_NO %d.",
- ControlFile->catalog_version_no, CATALOG_VERSION_NO),
+ " but the server was compiled with CATALOG_VERSION_NO %d.",
+ ControlFile->catalog_version_no, CATALOG_VERSION_NO),
errhint("It looks like you need to initdb.")));
if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with MAXALIGN %d,"
- " but the server was compiled with MAXALIGN %d.",
- ControlFile->maxAlign, MAXIMUM_ALIGNOF),
+ errdetail("The database cluster was initialized with MAXALIGN %d,"
+ " but the server was compiled with MAXALIGN %d.",
+ ControlFile->maxAlign, MAXIMUM_ALIGNOF),
errhint("It looks like you need to initdb.")));
if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
ereport(FATAL,
if (ControlFile->blcksz != BLCKSZ)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with BLCKSZ %d,"
- " but the server was compiled with BLCKSZ %d.",
- ControlFile->blcksz, BLCKSZ),
+ errdetail("The database cluster was initialized with BLCKSZ %d,"
+ " but the server was compiled with BLCKSZ %d.",
+ ControlFile->blcksz, BLCKSZ),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->relseg_size != RELSEG_SIZE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
- " but the server was compiled with RELSEG_SIZE %d.",
- ControlFile->relseg_size, RELSEG_SIZE),
+ errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
+ " but the server was compiled with RELSEG_SIZE %d.",
+ ControlFile->relseg_size, RELSEG_SIZE),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
- " but the server was compiled with XLOG_BLCKSZ %d.",
- ControlFile->xlog_blcksz, XLOG_BLCKSZ),
- errhint("It looks like you need to recompile or initdb.")));
- if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
- ereport(FATAL,
- (errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
- " but the server was compiled with XLOG_SEG_SIZE %d.",
- ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
+ errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
+ " but the server was compiled with XLOG_BLCKSZ %d.",
+ ControlFile->xlog_blcksz, XLOG_BLCKSZ),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->nameDataLen != NAMEDATALEN)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with NAMEDATALEN %d,"
- " but the server was compiled with NAMEDATALEN %d.",
- ControlFile->nameDataLen, NAMEDATALEN),
+ errdetail("The database cluster was initialized with NAMEDATALEN %d,"
+ " but the server was compiled with NAMEDATALEN %d.",
+ ControlFile->nameDataLen, NAMEDATALEN),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
- " but the server was compiled with INDEX_MAX_KEYS %d.",
+ " but the server was compiled with INDEX_MAX_KEYS %d.",
ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
- " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
- ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
+ " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
+ ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->loblksize != LOBLKSIZE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with LOBLKSIZE %d,"
- " but the server was compiled with LOBLKSIZE %d.",
- ControlFile->loblksize, (int) LOBLKSIZE),
+ errdetail("The database cluster was initialized with LOBLKSIZE %d,"
+ " but the server was compiled with LOBLKSIZE %d.",
+ ControlFile->loblksize, (int) LOBLKSIZE),
errhint("It looks like you need to recompile or initdb.")));
#ifdef USE_FLOAT4_BYVAL
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
- " but the server was compiled with USE_FLOAT4_BYVAL."),
+ " but the server was compiled with USE_FLOAT4_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#else
if (ControlFile->float4ByVal != false)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
- " but the server was compiled without USE_FLOAT4_BYVAL."),
+ errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
+ " but the server was compiled without USE_FLOAT4_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#endif
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
- " but the server was compiled with USE_FLOAT8_BYVAL."),
+ " but the server was compiled with USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#else
if (ControlFile->float8ByVal != false)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
- errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
- " but the server was compiled without USE_FLOAT8_BYVAL."),
+ errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
+ " but the server was compiled without USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#endif
+ wal_segment_size = ControlFile->xlog_seg_size;
+
+ if (!IsValidWalSegSize(wal_segment_size))
+ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
+ "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
+ wal_segment_size,
+ wal_segment_size)));
+
+ snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
+ SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
+ PGC_S_OVERRIDE);
+
+ /* check and update variables dependent on wal_segment_size */
+ if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
+ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\".")));
+
+ if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
+ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\".")));
+
+ UsableBytesInSegment =
+ (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
+ (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
+
+ CalculateCheckpointSegments();
+
/* Make the initdb settings visible as GUC variables, too */
SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
PGC_INTERNAL, PGC_S_OVERRIDE);
FIN_CRC32C(ControlFile->crc);
fd = BasicOpenFile(XLOG_CONTROL_FILE,
- O_RDWR | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ O_RDWR | PG_BINARY);
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not open control file \"%s\": %m",
- XLOG_CONTROL_FILE)));
+ errmsg("could not open file \"%s\": %m", XLOG_CONTROL_FILE)));
errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
{
/* if write didn't set errno, assume problem is no disk space */
errno = ENOSPC;
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not write to control file: %m")));
+ errmsg("could not write to file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
}
+ pgstat_report_wait_end();
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
if (pg_fsync(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync control file: %m")));
+ errmsg("could not fsync file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ pgstat_report_wait_end();
if (close(fd))
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not close control file: %m")));
+ errmsg("could not close file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
}
/*
int xbuffers;
xbuffers = NBuffers / 32;
- if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
- xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
+ if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
+ xbuffers = (wal_segment_size / XLOG_BLCKSZ);
if (xbuffers < 8)
xbuffers = 8;
return xbuffers;
return true;
}
+/*
+ * Read the control file, set respective GUCs.
+ *
+ * This is to be called during startup, including a crash recovery cycle,
+ * unless in bootstrap mode, where no control file yet exists. As there's no
+ * usable shared memory yet (its sizing can depend on the contents of the
+ * control file!), first store the contents in local memory. XLOGShmemInit()
+ * will then copy it to shared memory later.
+ *
+ * reset just controls whether previous contents are to be expected (in the
+ * reset case, there's a dangling pointer into old shared memory), or not.
+ * It is consulted only by the assertion below; whatever ControlFile pointed
+ * to before is simply overwritten, never freed here.
+ *
+ * The buffer allocated here is not released by this function. The
+ * shared-memory initialization code later in this file saves the pointer,
+ * copies the contents into the "Control File" shared struct when that
+ * struct is freshly initialized, and pfree()s the local buffer either way.
+ */
+void
+LocalProcessControlFile(bool reset)
+{
+ Assert(reset || ControlFile == NULL); /* without reset, nothing may be loaded yet */
+ ControlFile = palloc(sizeof(ControlFileData)); /* local, non-shared buffer */
+ ReadControlFile(); /* NOTE(review): presumably fills and validates *ControlFile -- confirm against ReadControlFile() */
+}
+
/*
* Initialization of shared memory for XLOG
*/
foundXLog;
char *allocptr;
int i;
+ ControlFileData *localControlFile;
#ifdef WAL_DEBUG
}
#endif
- ControlFile = (ControlFileData *)
- ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
+
XLogCtl = (XLogCtlData *)
ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
+ localControlFile = ControlFile;
+ ControlFile = (ControlFileData *)
+ ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
+
if (foundCFile || foundXLog)
{
/* both should be present or neither */
WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
"wal_insert");
+
+ if (localControlFile)
+ pfree(localControlFile);
return;
}
memset(XLogCtl, 0, sizeof(XLogCtlData));
+ /*
+ * Already have read control file locally, unless in bootstrap mode. Move
+ * contents into shared memory.
+ */
+ if (localControlFile)
+ {
+ memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
+ pfree(localControlFile);
+ }
+
/*
* Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
* multiple of the alignment for same, so no extra alignment padding is
/* WAL insertion locks. Ensure they're aligned to the full padded size */
allocptr += sizeof(WALInsertLockPadded) -
- ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
+ ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
(WALInsertLockPadded *) allocptr;
allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
SpinLockInit(&XLogCtl->info_lck);
SpinLockInit(&XLogCtl->ulsn_lck);
InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
-
- /*
- * If we are not in bootstrap mode, pg_control should already exist. Read
- * and validate it immediately (see comments in ReadControlFile() for the
- * reasons why).
- */
- if (!IsBootstrapProcessingMode())
- ReadControlFile();
}
/*
sysidentifier |= getpid() & 0xFFF;
/*
- * Generate a random nonce. This is used for authentication requests
- * that will fail because the user does not exist. The nonce is used to
- * create a genuine-looking password challenge for the non-existent user,
- * in lieu of an actual stored password.
+ * Generate a random nonce. This is used for authentication requests that
+ * will fail because the user does not exist. The nonce is used to create
+ * a genuine-looking password challenge for the non-existent user, in lieu
+ * of an actual stored password.
*/
if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
ereport(PANIC,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("could not generation secret authorization token")));
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not generate secret authorization token")));
/* First timeline ID is always 1 */
ThisTimeLineID = 1;
* segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
* used, so that we can use 0/0 to mean "before any valid WAL segment".
*/
- checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
+ checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
checkPoint.ThisTimeLineID = ThisTimeLineID;
checkPoint.PrevTimeLineID = ThisTimeLineID;
checkPoint.fullPageWrites = fullPageWrites;
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
+ AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
- SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
/* Set up the XLOG page header */
page->xlp_magic = XLOG_PAGE_MAGIC;
page->xlp_info = XLP_LONG_HEADER;
page->xlp_tli = ThisTimeLineID;
- page->xlp_pageaddr = XLogSegSize;
+ page->xlp_pageaddr = wal_segment_size;
longpage = (XLogLongPageHeader) page;
longpage->xlp_sysid = sysidentifier;
- longpage->xlp_seg_size = XLogSegSize;
+ longpage->xlp_seg_size = wal_segment_size;
longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
/* Insert the initial checkpoint record */
record->xl_rmid = RM_XLOG_ID;
recptr += SizeOfXLogRecord;
/* fill the XLogRecordDataHeaderShort struct */
- *(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
+ *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
*(recptr++) = sizeof(checkPoint);
memcpy(recptr, &checkPoint, sizeof(checkPoint));
recptr += sizeof(checkPoint);
/* Write the first page with the initial record */
errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
{
/* if write didn't set errno, assume problem is no disk space */
errno = ENOSPC;
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not write bootstrap transaction log file: %m")));
+ errmsg("could not write bootstrap write-ahead log file: %m")));
}
+ pgstat_report_wait_end();
+ pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
if (pg_fsync(openLogFile) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync bootstrap transaction log file: %m")));
+ errmsg("could not fsync bootstrap write-ahead log file: %m")));
+ pgstat_report_wait_end();
if (close(openLogFile))
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not close bootstrap transaction log file: %m")));
+ errmsg("could not close bootstrap write-ahead log file: %m")));
openLogFile = -1;
BootStrapMultiXact();
pfree(buffer);
+
+ /*
+ * Force control file to be read - in contrast to normal processing we'd
+ * otherwise never run the checks and GUC related initializations therein.
+ */
+ ReadControlFile();
}
static char *
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
- "recovery_target_action",
- item->value),
+ errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
+ "recovery_target_action",
+ item->value),
errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
ereport(DEBUG2,
}
if (rtli)
ereport(DEBUG2,
- (errmsg_internal("recovery_target_timeline = %u", rtli)));
+ (errmsg_internal("recovery_target_timeline = %u", rtli)));
else
ereport(DEBUG2,
- (errmsg_internal("recovery_target_timeline = latest")));
+ (errmsg_internal("recovery_target_timeline = latest")));
}
else if (strcmp(item->name, "recovery_target_xid") == 0)
{
if (errno == EINVAL || errno == ERANGE)
ereport(FATAL,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("recovery_target_xid is not a valid number: \"%s\"",
- item->value)));
+ errmsg("recovery_target_xid is not a valid number: \"%s\"",
+ item->value)));
ereport(DEBUG2,
(errmsg_internal("recovery_target_xid = %u",
recoveryTargetXid)));
{
recoveryTarget = RECOVERY_TARGET_TIME;
+ if (strcmp(item->value, "epoch") == 0 ||
+ strcmp(item->value, "infinity") == 0 ||
+ strcmp(item->value, "-infinity") == 0 ||
+ strcmp(item->value, "now") == 0 ||
+ strcmp(item->value, "today") == 0 ||
+ strcmp(item->value, "tomorrow") == 0 ||
+ strcmp(item->value, "yesterday") == 0)
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("recovery_target_time is not a valid timestamp: \"%s\"",
+ item->value)));
+
/*
* Convert the time string given by the user to TimestampTz form.
*/
recoveryTargetTime =
DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
- CStringGetDatum(item->value),
- ObjectIdGetDatum(InvalidOid),
+ CStringGetDatum(item->value),
+ ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1)));
ereport(DEBUG2,
(errmsg_internal("recovery_target_time = '%s'",
- timestamptz_to_str(recoveryTargetTime))));
+ timestamptz_to_str(recoveryTargetTime))));
}
else if (strcmp(item->name, "recovery_target_name") == 0)
{
DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
CStringGetDatum(item->value),
ObjectIdGetDatum(InvalidOid),
- Int32GetDatum(-1)));
+ Int32GetDatum(-1)));
ereport(DEBUG2,
(errmsg_internal("recovery_target_lsn = '%X/%X'",
(uint32) (recoveryTargetLSN >> 32),
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
- "recovery_target",
- item->value),
- errhint("The only allowed value is \"immediate\".")));
+ errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
+ "recovery_target",
+ item->value),
+ errhint("The only allowed value is \"immediate\".")));
ereport(DEBUG2,
(errmsg_internal("recovery_target = '%s'",
item->value)));
if (StandbyModeRequested && !IsUnderPostmaster)
ereport(FATAL,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("standby mode is not supported by single-user servers")));
+ errmsg("standby mode is not supported by single-user servers")));
/* Enable fetching from archive recovery area */
ArchiveRecoveryRequested = true;
* they are the same, but if the switch happens exactly at a segment
* boundary, startLogSegNo will be endLogSegNo + 1.
*/
- XLByteToPrevSeg(endOfLog, endLogSegNo);
- XLByteToSeg(endOfLog, startLogSegNo);
+ XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
+ XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
/*
* Initialize the starting WAL segment for the new timeline. If the switch
* avoid emplacing a bogus file.
*/
XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
- endOfLog % XLOG_SEG_SIZE);
+ XLogSegmentOffset(endOfLog, wal_segment_size));
}
else
{
if (close(fd))
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not close log file %s: %m",
+ errmsg("could not close file \"%s\": %m",
XLogFileNameP(ThisTimeLineID, startLogSegNo))));
}
* Let's just make real sure there are not .ready or .done flags posted
* for the new segment.
*/
- XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
+ XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
XLogArchiveCleanup(xlogfname);
/*
* For point-in-time recovery, this function decides whether we want to
* stop applying the XLOG before the current record.
*
- * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
+ * Returns true if we are stopping, false otherwise. If stopping, some
* information is saved in recoveryStopXid et al for use in annotating the
* new timeline's history file.
*/
recoveryStopTime = 0;
recoveryStopName[0] = '\0';
ereport(LOG,
- (errmsg("recovery stopping before WAL position (LSN) \"%X/%X\"",
+ (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
(uint32) (recoveryStopLSN >> 32),
(uint32) recoveryStopLSN)));
return true;
strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
ereport(LOG,
- (errmsg("recovery stopping at restore point \"%s\", time %s",
- recoveryStopName,
- timestamptz_to_str(recoveryStopTime))));
+ (errmsg("recovery stopping at restore point \"%s\", time %s",
+ recoveryStopName,
+ timestamptz_to_str(recoveryStopTime))));
return true;
}
}
recoveryStopTime = 0;
recoveryStopName[0] = '\0';
ereport(LOG,
- (errmsg("recovery stopping after WAL position (LSN) \"%X/%X\"",
+ (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
(uint32) (recoveryStopLSN >> 32),
(uint32) recoveryStopLSN)));
return true;
struct stat st;
/*
- * Read control file and check XLOG status looks valid.
- *
- * Note: in most control paths, *ControlFile is already valid and we need
- * not do ReadControlFile() here, but might as well do it to be sure.
+ * We should have an aux process resource owner to use, and we should not
+ * be in a transaction that's installed some other resowner.
*/
- ReadControlFile();
+ Assert(AuxProcessResourceOwner != NULL);
+ Assert(CurrentResourceOwner == NULL ||
+ CurrentResourceOwner == AuxProcessResourceOwner);
+ CurrentResourceOwner = AuxProcessResourceOwner;
+ /*
+ * Verify XLOG status looks valid.
+ */
if (ControlFile->state < DB_SHUTDOWNED ||
ControlFile->state > DB_IN_PRODUCTION ||
!XRecOffIsValid(ControlFile->checkPoint))
str_time(ControlFile->time))));
else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
ereport(LOG,
- (errmsg("database system was interrupted while in recovery at %s",
- str_time(ControlFile->time)),
- errhint("This probably means that some data is corrupted and"
- " you will have to use the last backup for recovery.")));
+ (errmsg("database system was interrupted while in recovery at %s",
+ str_time(ControlFile->time)),
+ errhint("This probably means that some data is corrupted and"
+ " you will have to use the last backup for recovery.")));
else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
ereport(LOG,
(errmsg("database system was interrupted while in recovery at log time %s",
str_time(ControlFile->checkPointCopy.time)),
errhint("If this has occurred more than once some data might be corrupted"
- " and you might need to choose an earlier recovery target.")));
+ " and you might need to choose an earlier recovery target.")));
else if (ControlFile->state == DB_IN_PRODUCTION)
ereport(LOG,
- (errmsg("database system was interrupted; last known up at %s",
- str_time(ControlFile->time))));
+ (errmsg("database system was interrupted; last known up at %s",
+ str_time(ControlFile->time))));
/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
*/
ValidateXLOGDirectoryStructure();
- /*
- * If we previously crashed, there might be data which we had written,
- * intending to fsync it, but which we had not actually fsync'd yet.
- * Therefore, a power failure in the near future might cause earlier
- * unflushed writes to be lost, even though more recent data written to
- * disk from here on would be persisted. To avoid that, fsync the entire
- * data directory.
+ /*----------
+ * If we previously crashed, perform a couple of actions:
+ * - The pg_wal directory may still include some temporary WAL segments
+ * used when creating a new segment, so perform some cleanup to avoid
+ * bloating this path. This is done first as there is no point in
+ * syncing this temporary data.
+ * - There might be data which we had written, intending to fsync it,
+ * but which we had not actually fsync'd yet. Therefore, a power failure
+ * in the near future might cause earlier unflushed writes to be lost,
+ * even though more recent data written to disk from here on would be
+ * persisted. To avoid that, fsync the entire data directory.
+ *---------
*/
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+ {
+ RemoveTempXlogFiles();
SyncDataDirectory();
+ }
/*
* Initialize on the assumption we want to recover to the latest timeline
recoveryTargetName)));
else if (recoveryTarget == RECOVERY_TARGET_LSN)
ereport(LOG,
- (errmsg("starting point-in-time recovery to WAL position (LSN) \"%X/%X\"",
+ (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
(uint32) (recoveryTargetLSN >> 32),
(uint32) recoveryTargetLSN)));
else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
/* Set up XLOG reader facility */
MemSet(&private, 0, sizeof(XLogPageReadPrivate));
- xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
+ xlogreader = XLogReaderAllocate(wal_segment_size, &XLogPageRead, &private);
if (!xlogreader)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory"),
- errdetail("Failed while allocating a WAL reading processor.")));
+ errdetail("Failed while allocating a WAL reading processor.")));
xlogreader->system_identifier = ControlFile->system_identifier;
/*
- * Allocate pages dedicated to WAL consistency checks, those had better
- * be aligned.
+ * Allocate two page buffers dedicated to WAL consistency checks. We do
+ * it this way, rather than just making static arrays, for two reasons:
+ * (1) no need to waste the storage in most instantiations of the backend;
+ * (2) a static char array isn't guaranteed to have any particular
+ * alignment, whereas palloc() will provide MAXALIGN'd storage.
*/
replay_image_masked = (char *) palloc(BLCKSZ);
master_image_masked = (char *) palloc(BLCKSZ);
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X",
- (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
+ (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
/*
if (symlink(ti->path, linkloc) < 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not create symbolic link \"%s\": %m",
- linkloc)));
+ errmsg("could not create symbolic link \"%s\": %m",
+ linkloc)));
pfree(ti->oid);
pfree(ti->path);
unlink(TABLESPACE_MAP_OLD);
if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
ereport(LOG,
- (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
- TABLESPACE_MAP, BACKUP_LABEL_FILE),
- errdetail("File \"%s\" was renamed to \"%s\".",
- TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+ (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+ TABLESPACE_MAP, BACKUP_LABEL_FILE),
+ errdetail("File \"%s\" was renamed to \"%s\".",
+ TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
else
ereport(LOG,
- (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
- TABLESPACE_MAP, BACKUP_LABEL_FILE),
- errdetail("Could not rename file \"%s\" to \"%s\": %m.",
- TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+ (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+ TABLESPACE_MAP, BACKUP_LABEL_FILE),
+ errdetail("Could not rename file \"%s\" to \"%s\": %m.",
+ TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
}
/*
StandbyMode = true;
}
- /*
- * Get the last valid checkpoint record. If the latest one according
- * to pg_control is broken, try the next-to-last one.
- */
+ /* Get the last valid checkpoint record. */
checkPointLoc = ControlFile->checkPoint;
RedoStartLSN = ControlFile->checkPointCopy.redo;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
{
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X",
- (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
+ (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
}
- else if (StandbyMode)
+ else
{
/*
- * The last valid checkpoint record required for a streaming
- * recovery exists in neither standby nor the primary.
+ * We used to attempt to go back to a secondary checkpoint record
+ * here, but only when not in standby_mode. We now just fail if we
+ * can't read the last checkpoint because this allows us to
+ * simplify processing around checkpoints.
*/
ereport(PANIC,
(errmsg("could not locate a valid checkpoint record")));
}
- else
- {
- checkPointLoc = ControlFile->prevCheckPoint;
- record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
- if (record != NULL)
- {
- ereport(LOG,
- (errmsg("using previous checkpoint record at %X/%X",
- (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
- InRecovery = true; /* force recovery even if SHUTDOWNED */
- }
- else
- ereport(PANIC,
- (errmsg("could not locate a valid checkpoint record")));
- }
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
}
* history, too.
*/
if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
- tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
+ tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
ControlFile->minRecoveryPointTLI)
ereport(FATAL,
(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
ereport(DEBUG1,
(errmsg_internal("redo record is at %X/%X; shutdown %s",
- (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
- wasShutdown ? "TRUE" : "FALSE")));
+ (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
+ wasShutdown ? "true" : "false")));
ereport(DEBUG1,
(errmsg_internal("next transaction ID: %u:%u; next OID: %u",
checkPoint.nextXidEpoch, checkPoint.nextXid,
checkPoint.nextOid)));
ereport(DEBUG1,
(errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
- checkPoint.nextMulti, checkPoint.nextMultiOffset)));
+ checkPoint.nextMulti, checkPoint.nextMultiOffset)));
ereport(DEBUG1,
- (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
- checkPoint.oldestXid, checkPoint.oldestXidDB)));
+ (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
+ checkPoint.oldestXid, checkPoint.oldestXidDB)));
ereport(DEBUG1,
(errmsg_internal("oldest MultiXactId: %u, in database %u",
- checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
+ checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
ereport(DEBUG1,
(errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
checkPoint.oldestCommitTsXid,
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
+ AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
- SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
SetCommitTsLimit(checkPoint.oldestCommitTsXid,
checkPoint.newestCommitTsXid);
XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
/*
* Copy any missing timeline history files between 'now' and the recovery
- * target timeline from archive to pg_wal. While we don't need those
- * files ourselves - the history file of the recovery target timeline
- * covers all the previous timelines in the history too - a cascading
- * standby server might be interested in them. Or, if you archive the WAL
- * from this server to a different archive than the master, it'd be good
- * for all the history files to get archived there after failover, so that
- * you can use one of the old timelines as a PITR target. Timeline history
- * files are small, so it's better to copy them unnecessarily than not
- * copy them and regret later.
+ * target timeline from archive to pg_wal. While we don't need those files
+ * ourselves - the history file of the recovery target timeline covers all
+ * the previous timelines in the history too - a cascading standby server
+ * might be interested in them. Or, if you archive the WAL from this
+ * server to a different archive than the master, it'd be good for all the
+ * history files to get archived there after failover, so that you can use
+ * one of the old timelines as a PITR target. Timeline history files are
+ * small, so it's better to copy them unnecessarily than not copy them and
+ * regret later.
*/
restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
+ /*
+ * Before running in recovery, scan pg_twophase and fill in its status to
+ * be able to work on entries generated by redo. Doing a scan before
+ * taking any recovery action has the merit of discarding any 2PC files
+ * that are newer than the first record to replay, avoiding conflicts at
+ * replay. It also avoids any subsequent scans of the on-disk two-phase
+ * data during recovery.
+ */
+ restoreTwoPhaseData();
+
lastFullPageWrites = checkPoint.fullPageWrites;
RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
recoveryTargetTLI)));
ControlFile->state = DB_IN_CRASH_RECOVERY;
}
- ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint;
if (InArchiveRecovery)
ereport(FATAL,
(errmsg("backup_label contains data inconsistent with control file"),
errhint("This means that the backup is corrupted and you will "
- "have to use another backup for recovery.")));
+ "have to use another backup for recovery.")));
ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
}
}
/* No need to hold ControlFileLock yet, we aren't up far enough */
UpdateControlFile();
- /* initialize our local copy of minRecoveryPoint */
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ /*
+ * Initialize our local copy of minRecoveryPoint. When doing crash
+ * recovery we want to replay up to the end of WAL. In particular, in
+ * the case of a promoted standby, the minRecoveryPoint value in the
+ * control file is only updated after the first checkpoint. However,
+ * if the instance crashes before the first post-recovery checkpoint
+ * is completed then recovery will use a stale location, causing the
+ * startup process to think that there are still invalid page
+ * references when checking for data consistency.
+ */
+ if (InArchiveRecovery)
+ {
+ minRecoveryPoint = ControlFile->minRecoveryPoint;
+ minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ else
+ {
+ minRecoveryPoint = InvalidXLogRecPtr;
+ minRecoveryPointTLI = 0;
+ }
/*
* Reset pgstat data, because it may be invalid after recovery.
ProcArrayApplyRecoveryInfo(&running);
- StandbyRecoverPreparedTransactions(false);
+ StandbyRecoverPreparedTransactions();
}
}
ereport(LOG,
(errmsg("redo starts at %X/%X",
- (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
+ (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
/*
* main redo apply loop
#ifdef WAL_DEBUG
if (XLOG_DEBUG ||
- (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
+ (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
{
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
- (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
- (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
+ (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
+ (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
xlog_outrec(&buf, xlogreader);
appendStringInfoString(&buf, " - ");
xlog_outdesc(&buf, xlogreader);
ereport(LOG,
(errmsg("redo done at %X/%X",
- (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
+ (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
xtime = GetLatestXTime();
if (xtime)
ereport(LOG,
- (errmsg("last completed transaction was at log time %s",
- timestamptz_to_str(xtime))));
+ (errmsg("last completed transaction was at log time %s",
+ timestamptz_to_str(xtime))));
InRedo = false;
}
errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
else
ereport(FATAL,
- (errmsg("WAL ends before consistent recovery point")));
+ (errmsg("WAL ends before consistent recovery point")));
}
}
+ /*
+ * Pre-scan prepared transactions to find out the range of XIDs present.
+ * This information is not quite needed yet, but it is positioned here so
+ * that potential problems are detected before any on-disk change is done.
+ */
+ oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
+
/*
* Consider whether we need to assign a new timeline ID.
*
snprintf(reason, sizeof(reason),
"%s LSN %X/%X\n",
recoveryStopAfter ? "after" : "before",
- (uint32 ) (recoveryStopLSN >> 32),
+ (uint32) (recoveryStopLSN >> 32),
(uint32) recoveryStopLSN);
else if (recoveryTarget == RECOVERY_TARGET_NAME)
snprintf(reason, sizeof(reason),
else
snprintf(reason, sizeof(reason), "no recovery target specified");
+ /*
+ * We are now done reading the old WAL. Turn off archive fetching if
+ * it was active, and make a writable copy of the last WAL segment.
+ * (Note that we also have a copy of the last block of the old WAL in
+ * readBuf; we will use that below.)
+ */
+ exitArchiveRecovery(EndOfLogTLI, EndOfLog);
+
+ /*
+ * Write the timeline history file, and have it archived. After this
+ * point (or rather, as soon as the file is archived), the timeline
+ * will appear as "taken" in the WAL archive and to any standby
+ * servers. If we crash before actually switching to the new
+ * timeline, standby servers will nevertheless think that we switched
+ * to the new timeline, and will try to connect to the new timeline.
+ * To minimize the window for that, try to do as little as possible
+ * between here and writing the end-of-recovery record.
+ */
writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
EndRecPtr, reason);
}
XLogCtl->PrevTimeLineID = PrevTimeLineID;
/*
- * We are now done reading the old WAL. Turn off archive fetching if it
- * was active, and make a writable copy of the last WAL segment. (Note
- * that we also have a copy of the last block of the old WAL in readBuf;
- * we will use that below.)
- */
- if (ArchiveRecoveryRequested)
- exitArchiveRecovery(EndOfLogTLI, EndOfLog);
-
- /*
- * Prepare to write WAL starting at EndOfLog position, and init xlog
+ * Prepare to write WAL starting at EndOfLog location, and init xlog
* buffer cache using the block containing the last record from the
* previous incarnation.
*/
XLogRecPtr pageBeginPtr;
pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
- Assert(readOff == pageBeginPtr % XLogSegSize);
+ Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
firstIdx = XLogRecPtrToBufIdx(EndOfLog);
XLogCtl->LogwrtRqst.Write = EndOfLog;
XLogCtl->LogwrtRqst.Flush = EndOfLog;
- /* Pre-scan prepared transactions to find out the range of XIDs present */
- oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
-
/*
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
* record before resource manager writes cleanup WAL records or checkpoint
{
if (fast_promote)
{
- checkPointLoc = ControlFile->prevCheckPoint;
+ checkPointLoc = ControlFile->checkPoint;
/*
* Confirm the last checkpoint is available for us to recover
- * from if we fail. Note that we don't check for the secondary
- * checkpoint since that isn't available in most base backups.
+ * from if we fail.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
if (record != NULL)
* restored from the archive to begin with, it's expected to have a
* .done file).
*/
- if (EndOfLog % XLOG_SEG_SIZE != 0 && XLogArchivingActive())
+ if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
+ XLogArchivingActive())
{
char origfname[MAXFNAMELEN];
XLogSegNo endLogSegNo;
- XLByteToPrevSeg(EndOfLog, endLogSegNo);
- XLogFileName(origfname, EndOfLogTLI, endLogSegNo);
+ XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
+ XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
if (!XLogArchiveIsReadyOrDone(origfname))
{
char partialfname[MAXFNAMELEN];
char partialpath[MAXPGPATH];
- XLogFilePath(origpath, EndOfLogTLI, endLogSegNo);
+ XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
if (XLogRecPtrIsInvalid(minRecoveryPoint))
return;
+ Assert(InArchiveRecovery);
+
/*
* assume that we are called in the startup process, and hence don't need
* a lock to read lastReplayedEndRecPtr
/*
* Have we passed our safe starting point? Note that minRecoveryPoint is
* known to be incorrectly set if ControlFile->backupEndRequired, until
- * the XLOG_BACKUP_RECORD arrives to advise us of the correct
+ * the XLOG_BACKUP_END arrives to advise us of the correct
* minRecoveryPoint. All we know prior to that is that we're not
* consistent yet.
*/
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
- * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
+ * 1 for "primary", 0 for "other" (backup_label)
*/
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
{
case 1:
ereport(LOG,
- (errmsg("invalid primary checkpoint link in control file")));
- break;
- case 2:
- ereport(LOG,
- (errmsg("invalid secondary checkpoint link in control file")));
+ (errmsg("invalid primary checkpoint link in control file")));
break;
default:
ereport(LOG,
- (errmsg("invalid checkpoint link in backup_label file")));
+ (errmsg("invalid checkpoint link in backup_label file")));
break;
}
return NULL;
ereport(LOG,
(errmsg("invalid primary checkpoint record")));
break;
- case 2:
- ereport(LOG,
- (errmsg("invalid secondary checkpoint record")));
- break;
default:
ereport(LOG,
(errmsg("invalid checkpoint record")));
ereport(LOG,
(errmsg("invalid resource manager ID in primary checkpoint record")));
break;
- case 2:
- ereport(LOG,
- (errmsg("invalid resource manager ID in secondary checkpoint record")));
- break;
default:
ereport(LOG,
- (errmsg("invalid resource manager ID in checkpoint record")));
+ (errmsg("invalid resource manager ID in checkpoint record")));
break;
}
return NULL;
{
case 1:
ereport(LOG,
- (errmsg("invalid xl_info in primary checkpoint record")));
- break;
- case 2:
- ereport(LOG,
- (errmsg("invalid xl_info in secondary checkpoint record")));
+ (errmsg("invalid xl_info in primary checkpoint record")));
break;
default:
ereport(LOG,
{
case 1:
ereport(LOG,
- (errmsg("invalid length of primary checkpoint record")));
- break;
- case 2:
- ereport(LOG,
- (errmsg("invalid length of secondary checkpoint record")));
+ (errmsg("invalid length of primary checkpoint record")));
break;
default:
ereport(LOG,
ThisTimeLineID = XLogCtl->ThisTimeLineID;
Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
+ /* set wal_segment_size */
+ wal_segment_size = ControlFile->xlog_seg_size;
+
/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
(void) GetRedoRecPtr();
/* Also update our copy of doPageWrites. */
void
ShutdownXLOG(int code, Datum arg)
{
+ /*
+ * We should have an aux process resource owner to use, and we should not
+ * be in a transaction that's installed some other resowner.
+ */
+ Assert(AuxProcessResourceOwner != NULL);
+ Assert(CurrentResourceOwner == NULL ||
+ CurrentResourceOwner == AuxProcessResourceOwner);
+ CurrentResourceOwner = AuxProcessResourceOwner;
+
/* Don't be chatty in standalone mode */
ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("shutting down")));
+ /*
+ * Signal walsenders to move to stopping state.
+ */
+ WalSndInitStopping();
+
+ /*
+ * Wait for WAL senders to be in stopping state. This prevents commands
+ * from writing new WAL.
+ */
+ WalSndWaitStopping();
+
if (RecoveryInProgress())
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
else
*/
longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
longest_usecs = CheckpointStats.ckpt_longest_sync -
- (uint64) longest_secs *1000000;
+ (uint64) longest_secs * 1000000;
average_sync_time = 0;
if (CheckpointStats.ckpt_sync_rels > 0)
average_sync_time = CheckpointStats.ckpt_agg_sync_time /
CheckpointStats.ckpt_sync_rels;
average_secs = (long) (average_sync_time / 1000000);
- average_usecs = average_sync_time - (uint64) average_secs *1000000;
+ average_usecs = average_sync_time - (uint64) average_secs * 1000000;
elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
- "%d transaction log file(s) added, %d removed, %d recycled; "
+ "%d WAL file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
"distance=%d kB, estimate=%d kB",
* more.
*
* When checkpoints are triggered by max_wal_size, this should converge to
- * CheckpointSegments * XLOG_SEG_SIZE,
+ * CheckpointSegments * wal_segment_size,
*
* Note: This doesn't pay any attention to what caused the checkpoint.
* Checkpoints triggered manually with CHECKPOINT command, or by e.g.
bool shutdown;
CheckPoint checkPoint;
XLogRecPtr recptr;
+ XLogSegNo _logSegNo;
XLogCtlInsert *Insert = &XLogCtl->Insert;
uint32 freespace;
XLogRecPtr PriorRedoPtr;
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
ereport(DEBUG1,
- (errmsg("checkpoint skipped due to an idle system")));
+ (errmsg("checkpoint skipped because system is idle")));
return;
}
}
freespace = INSERT_FREESPACE(curInsert);
if (freespace == 0)
{
- if (curInsert % XLogSegSize == 0)
+ if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
curInsert += SizeOfXLogLongPHD;
else
curInsert += SizeOfXLogShortPHD;
/*
* Get the other info we need for the checkpoint record.
+ *
+ * We don't need to save oldestClogXid in the checkpoint; it only matters
+ * for the short period in which clog is being truncated, and if we crash
+ * during that we'll redo the clog truncation and fix up oldestClogXid
+ * there.
*/
LWLockAcquire(XidGenLock, LW_SHARED);
checkPoint.nextXid = ShmemVariableCache->nextXid;
* that are currently in commit critical sections. If an xact inserted
* its commit record into XLOG just before the REDO point, then a crash
* restart from the REDO point would not replay that record, which means
- * that our flushing had better include the xact's update of pg_clog. So
+ * that our flushing had better include the xact's update of pg_xact. So
* we wait till he's out of his commit critical section before proceeding.
* See notes in RecordTransactionCommit().
*
if (shutdown)
{
if (flags & CHECKPOINT_END_OF_RECOVERY)
- LocalXLogInsertAllowed = -1; /* return to "check" state */
+ LocalXLogInsertAllowed = -1; /* return to "check" state */
else
LocalXLogInsertAllowed = 0; /* never again write WAL */
}
*/
if (shutdown && checkPoint.redo != ProcLastRecPtr)
ereport(PANIC,
- (errmsg("concurrent transaction log activity while database system is shutting down")));
+ (errmsg("concurrent write-ahead log activity while database system is shutting down")));
/*
- * Remember the prior checkpoint's redo pointer, used later to determine
- * the point where the log can be truncated.
+ * Remember the prior checkpoint's redo ptr for
+ * UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
- ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
smgrpostckpt();
/*
- * Delete old log files (those no longer needed even for previous
- * checkpoint or the standbys in XLOG streaming).
+ * Update the average distance between checkpoints if the prior checkpoint
+ * exists.
*/
if (PriorRedoPtr != InvalidXLogRecPtr)
- {
- XLogSegNo _logSegNo;
-
- /* Update the average distance between checkpoints. */
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
- XLByteToSeg(PriorRedoPtr, _logSegNo);
- KeepLogSeg(recptr, &_logSegNo);
- _logSegNo--;
- RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
- }
+ /*
+ * Delete old log files (those no longer needed for the last checkpoint) to
+ * prevent the disk holding the xlog from growing full.
+ */
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+ KeepLogSeg(recptr, &_logSegNo);
+ _logSegNo--;
+ RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
/*
* Make more log segments if needed. (Do this after recycling old log
* StartupSUBTRANS hasn't been called yet.
*/
if (!RecoveryInProgress())
- TruncateSUBTRANS(GetOldestXmin(NULL, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
XLogRecPtr lastCheckPointEndPtr;
CheckPoint lastCheckPoint;
XLogRecPtr PriorRedoPtr;
+ XLogRecPtr receivePtr;
+ XLogRecPtr replayPtr;
+ TimeLineID replayTLI;
+ XLogRecPtr endptr;
+ XLogSegNo _logSegNo;
TimestampTz xtime;
/*
if (!RecoveryInProgress())
{
ereport(DEBUG2,
- (errmsg("skipping restartpoint, recovery has already ended")));
+ (errmsg("skipping restartpoint, recovery has already ended")));
LWLockRelease(CheckpointLock);
return false;
}
CheckPointGuts(lastCheckPoint.redo, flags);
/*
- * Remember the prior checkpoint's redo pointer, used later to determine
- * the point at which we can truncate the log.
+ * Remember the prior checkpoint's redo ptr for
+ * UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
{
- ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = lastCheckPointRecPtr;
ControlFile->checkPointCopy = lastCheckPoint;
ControlFile->time = (pg_time_t) time(NULL);
LWLockRelease(ControlFileLock);
/*
- * Delete old log files (those no longer needed even for previous
- * checkpoint/restartpoint) to prevent the disk holding the xlog from
- * growing full.
+ * Update the average distance between checkpoints/restartpoints if the
+ * prior checkpoint exists.
*/
if (PriorRedoPtr != InvalidXLogRecPtr)
- {
- XLogRecPtr receivePtr;
- XLogRecPtr replayPtr;
- TimeLineID replayTLI;
- XLogRecPtr endptr;
- XLogSegNo _logSegNo;
-
- /* Update the average distance between checkpoints/restartpoints. */
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
- XLByteToSeg(PriorRedoPtr, _logSegNo);
-
- /*
- * Get the current end of xlog replayed or received, whichever is
- * later.
- */
- receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
- replayPtr = GetXLogReplayRecPtr(&replayTLI);
- endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
+ /*
+ * Delete old log files (those no longer needed for the last restartpoint)
+ * to prevent the disk holding the xlog from growing full.
+ */
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
- KeepLogSeg(endptr, &_logSegNo);
- _logSegNo--;
+ /*
+ * Retreat _logSegNo using the current end of xlog replayed or received,
+ * whichever is later.
+ */
+ receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
+ replayPtr = GetXLogReplayRecPtr(&replayTLI);
+ endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
+ KeepLogSeg(endptr, &_logSegNo);
+ _logSegNo--;
- /*
- * Try to recycle segments on a useful timeline. If we've been
- * promoted since the beginning of this restartpoint, use the new
- * timeline chosen at end of recovery (RecoveryInProgress() sets
- * ThisTimeLineID in that case). If we're still in recovery, use the
- * timeline we're currently replaying.
- *
- * There is no guarantee that the WAL segments will be useful on the
- * current timeline; if recovery proceeds to a new timeline right
- * after this, the pre-allocated WAL segments on this timeline will
- * not be used, and will go wasted until recycled on the next
- * restartpoint. We'll live with that.
- */
- if (RecoveryInProgress())
- ThisTimeLineID = replayTLI;
+ /*
+ * Try to recycle segments on a useful timeline. If we've been promoted
+ * since the beginning of this restartpoint, use the new timeline chosen
+ * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
+ * case). If we're still in recovery, use the timeline we're currently
+ * replaying.
+ *
+ * There is no guarantee that the WAL segments will be useful on the
+ * current timeline; if recovery proceeds to a new timeline right after
+ * this, the pre-allocated WAL segments on this timeline will not be used,
+ * and will go wasted until recycled on the next restartpoint. We'll live
+ * with that.
+ */
+ if (RecoveryInProgress())
+ ThisTimeLineID = replayTLI;
- RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
+ RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
- /*
- * Make more log segments if needed. (Do this after recycling old log
- * segments, since that may supply some of the needed files.)
- */
- PreallocXlogFiles(endptr);
+ /*
+ * Make more log segments if needed. (Do this after recycling old log
+ * segments, since that may supply some of the needed files.)
+ */
+ PreallocXlogFiles(endptr);
- /*
- * ThisTimeLineID is normally not set when we're still in recovery.
- * However, recycling/preallocating segments above needed
- * ThisTimeLineID to determine which timeline to install the segments
- * on. Reset it now, to restore the normal state of affairs for
- * debugging purposes.
- */
- if (RecoveryInProgress())
- ThisTimeLineID = 0;
- }
+ /*
+ * ThisTimeLineID is normally not set when we're still in recovery.
+ * However, recycling/preallocating segments above needed ThisTimeLineID
+ * to determine which timeline to install the segments on. Reset it now,
+ * to restore the normal state of affairs for debugging purposes.
+ */
+ if (RecoveryInProgress())
+ ThisTimeLineID = 0;
/*
* Truncate pg_subtrans if possible. We can throw away all data before
* this because StartupSUBTRANS hasn't been called yet.
*/
if (EnableHotStandby)
- TruncateSUBTRANS(GetOldestXmin(NULL, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
/* Real work is done, but log and update before releasing lock. */
LogCheckpointEnd(true);
xtime = GetLatestXTime();
ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
- (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
- xtime ? errdetail("last completed transaction was at log time %s",
- timestamptz_to_str(xtime)) : 0));
+ (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
+ xtime ? errdetail("Last completed transaction was at log time %s.",
+ timestamptz_to_str(xtime)) : 0));
LWLockRelease(CheckpointLock);
XLogSegNo segno;
XLogRecPtr keep;
- XLByteToSeg(recptr, segno);
+ XLByteToSeg(recptr, segno, wal_segment_size);
keep = XLogGetReplicationSlotMinimumLSN();
/* compute limit for wal_keep_segments first */
{
XLogSegNo slotSegNo;
- XLByteToSeg(keep, slotSegNo);
+ XLByteToSeg(keep, slotSegNo, wal_segment_size);
if (slotSegNo <= 0)
segno = 1;
*/
if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
ereport(PANIC,
- (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
- newTLI, ThisTimeLineID)));
+ (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
+ newTLI, ThisTimeLineID)));
/*
* If we have not yet reached min recovery point, and we're about to
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
+
+ /*
+ * No need to set oldestClogXid here as well; it'll be set when we
+ * redo an xl_clog_truncate if it changed since initialization.
+ */
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
/*
!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
ereport(PANIC,
- (errmsg("online backup was canceled, recovery cannot continue")));
+ (errmsg("online backup was canceled, recovery cannot continue")));
/*
* If we see a shutdown checkpoint, we know that nothing was running
ProcArrayApplyRecoveryInfo(&running);
- StandbyRecoverPreparedTransactions(true);
+ StandbyRecoverPreparedTransactions();
}
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
checkPoint.nextXid))
ShmemVariableCache->nextXid = checkPoint.nextXid;
LWLockRelease(XidGenLock);
- /* ... but still treat OID counter as exact */
- LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
- ShmemVariableCache->nextOid = checkPoint.nextOid;
- ShmemVariableCache->oidCount = 0;
- LWLockRelease(OidGenLock);
+
+ /*
+ * We ignore the nextOid counter in an ONLINE checkpoint, preferring
+ * to track OID assignment through XLOG_NEXTOID records. The nextOid
+ * counter is from the start of the checkpoint and might well be stale
+ * compared to later XLOG_NEXTOID records. We could try to take the
+ * maximum of the nextOid counter and our latest value, but since
+ * there's no particular guarantee about the speed with which the OID
+ * counter wraps around, that's a risky thing to do. In any case,
+ * users of the nextOid counter are required to avoid assignment of
+ * duplicates, so that a somewhat out-of-date value should be safe.
+ */
+
+ /* Handle multixact */
MultiXactAdvanceNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
* Update minRecoveryPoint to ensure that if recovery is aborted, we
* recover back up to this point before allowing hot standby again.
* This is important if the max_* settings are decreased, to ensure
- * you don't run queries against the WAL preceding the change.
+ * you don't run queries against the WAL preceding the change. The
+ * local copies cannot be updated as long as crash recovery is
+ * happening and we expect all the WAL to be replayed.
*/
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
- if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
+ if (InArchiveRecovery)
+ {
+ minRecoveryPoint = ControlFile->minRecoveryPoint;
+ minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
{
ControlFile->minRecoveryPoint = lsn;
ControlFile->minRecoveryPointTLI = ThisTimeLineID;
appendStringInfoString(buf, " FPW");
}
}
-#endif /* WAL_DEBUG */
+#endif /* WAL_DEBUG */
/*
* Returns a string describing an XLogRecord, consisting of its identity
*/
if (openLogFile >= 0)
{
+ pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
if (pg_fsync(openLogFile) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync log segment %s: %m",
- XLogFileNameP(ThisTimeLineID, openLogSegNo))));
+ errmsg("could not fsync file \"%s\": %m",
+ XLogFileNameP(ThisTimeLineID, openLogSegNo))));
+ pgstat_report_wait_end();
if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
XLogFileClose();
}
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
+ pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
switch (sync_method)
{
case SYNC_METHOD_FSYNC:
if (pg_fsync_no_writethrough(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync log file %s: %m",
+ errmsg("could not fsync file \"%s\": %m",
XLogFileNameP(ThisTimeLineID, segno))));
break;
#ifdef HAVE_FSYNC_WRITETHROUGH
if (pg_fsync_writethrough(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fsync write-through log file %s: %m",
- XLogFileNameP(ThisTimeLineID, segno))));
+ errmsg("could not fsync write-through file \"%s\": %m",
+ XLogFileNameP(ThisTimeLineID, segno))));
break;
#endif
#ifdef HAVE_FDATASYNC
if (pg_fdatasync(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
- errmsg("could not fdatasync log file %s: %m",
+ errmsg("could not fdatasync file \"%s\": %m",
XLogFileNameP(ThisTimeLineID, segno))));
break;
#endif
elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
break;
}
+ pgstat_report_wait_end();
}
/*
{
char *result = palloc(MAXFNAMELEN);
- XLogFileName(result, tli, segno);
+ XLogFileName(result, tli, segno, wal_segment_size);
return result;
}
* when backup needs to generate tablespace_map file, it is used to
* embed escape character before newline character in tablespace path.
*
- * Returns the minimum WAL position that must be present to restore from this
+ * Returns the minimum WAL location that must be present to restore from this
* backup, and the corresponding timeline ID in *starttli_p.
*
* Every successfully started non-exclusive backup must be stopped by calling
*/
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
- StringInfo labelfile, DIR *tblspcdir, List **tablespaces,
+ StringInfo labelfile, List **tablespaces,
StringInfo tblspcmapfile, bool infotbssize,
bool needtblspcmapfile)
{
if (!backup_started_in_recovery && !XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("WAL level not sufficient for making an online backup"),
+ errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
if (strlen(backupidstr) > MAXPGPATH)
if (exclusive)
{
/*
- * At first, mark that we're now starting an exclusive backup,
- * to ensure that there are no other sessions currently running
+ * At first, mark that we're now starting an exclusive backup, to
+ * ensure that there are no other sessions currently running
* pg_start_backup() or pg_stop_backup().
*/
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
{
bool gotUniqueStartpoint = false;
+ DIR *tblspcdir;
struct dirent *de;
tablespaceinfo *ti;
int datadirpathlen;
if (!checkpointfpw || startpoint <= recptr)
ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("WAL generated with full_page_writes=off was replayed "
- "since last restartpoint"),
- errhint("This means that the backup being taken on the standby "
- "is corrupt and should not be used. "
- "Enable full_page_writes and run CHECKPOINT on the master, "
- "and then try an online backup again.")));
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL generated with full_page_writes=off was replayed "
+ "since last restartpoint"),
+ errhint("This means that the backup being taken on the standby "
+ "is corrupt and should not be used. "
+ "Enable full_page_writes and run CHECKPOINT on the master, "
+ "and then try an online backup again.")));
/*
* During recovery, since we don't use the end-of-backup WAL
WALInsertLockRelease();
} while (!gotUniqueStartpoint);
- XLByteToSeg(startpoint, _logSegNo);
- XLogFileName(xlogfilename, starttli, _logSegNo);
+ XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+ XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
/*
* Construct tablespace_map file
datadirpathlen = strlen(DataDir);
/* Collect information about all tablespaces */
+ tblspcdir = AllocateDir("pg_tblspc");
while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
{
- char fullpath[MAXPGPATH];
+ char fullpath[MAXPGPATH + 10];
char linkpath[MAXPGPATH];
char *relpath = NULL;
int rllen;
appendStringInfoChar(&buflinkpath, *s++);
}
-
/*
* Relpath holds the relative path of the tablespace directory
* when it's located within PGDATA, or NULL if it's located
*/
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("tablespaces are not supported on this platform")));
+ errmsg("tablespaces are not supported on this platform")));
#endif
}
+ FreeDir(tblspcdir);
/*
* Construct backup label file
"%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&stamp_time, log_timezone));
appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
- (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
+ (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
- (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
+ (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
exclusive ? "pg_start_backup" : "streamed");
appendStringInfo(labelfile, "BACKUP FROM: %s\n",
backup_started_in_recovery ? "standby" : "master");
appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
+ appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
/*
* Okay, write the file, or return its contents to caller.
{
/*
* Check for existing backup label --- implies a backup is already
- * running. (XXX given that we checked exclusiveBackupState above,
- * maybe it would be OK to just unlink any such label file?)
+ * running. (XXX given that we checked exclusiveBackupState
+ * above, maybe it would be OK to just unlink any such label
+ * file?)
*/
if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
{
}
else
ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("a backup is already in progress"),
- errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
- TABLESPACE_MAP)));
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("a backup is already in progress"),
+ errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
+ TABLESPACE_MAP)));
fp = AllocateFile(TABLESPACE_MAP, "w");
/*
* Mark that start phase has correctly finished for an exclusive backup.
+ * Session-level locks are updated as well to reflect that state.
+ *
+ * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
+ * counters and session-level lock. Otherwise they can be updated
+ * inconsistently, which might cause do_pg_abort_backup() to fail.
*/
if (exclusive)
{
WALInsertLockAcquireExclusive();
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
+
+ /* Set session-level lock */
+ sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
WALInsertLockRelease();
}
+ else
+ sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
/*
* We're done. As a convenience, return the starting WAL location.
WALInsertLockRelease();
}
+/*
+ * Report the backup state recorded for the current session.
+ *
+ * Returns the file-local sessionBackupState. This file sets it to
+ * SESSION_BACKUP_EXCLUSIVE or SESSION_BACKUP_NON_EXCLUSIVE once the start
+ * phase of a backup has finished, and resets it to SESSION_BACKUP_NONE while
+ * stopping a backup. Takes no arguments and modifies nothing.
+ */
+SessionBackupState
+get_backup_status(void)
+{
+ return sessionBackupState;
+}
+
/*
* do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
* function.
* If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
* the non-exclusive backup specified by 'labelfile'.
*
- * Returns the last WAL position that must be present to restore from this
+ * Returns the last WAL location that must be present to restore from this
* backup, and the corresponding timeline ID in *stoptli_p.
*
* It is the responsibility of the caller of this function to verify the
if (!backup_started_in_recovery && !XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("WAL level not sufficient for making an online backup"),
+ errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
if (exclusive)
{
/*
- * At first, mark that we're now stopping an exclusive backup,
- * to ensure that there are no other sessions currently running
+ * At first, mark that we're now stopping an exclusive backup, to
+ * ensure that there are no other sessions currently running
* pg_start_backup() or pg_stop_backup().
*/
WALInsertLockAcquireExclusive();
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
- if (unlink(BACKUP_LABEL_FILE) != 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m",
- BACKUP_LABEL_FILE)));
+ durable_unlink(BACKUP_LABEL_FILE, ERROR);
/*
- * Remove tablespace_map file if present, it is created only if there
- * are tablespaces.
+ * Remove tablespace_map file if present, it is created only if
+ * there are tablespaces.
*/
- unlink(TABLESPACE_MAP);
+ durable_unlink(TABLESPACE_MAP, DEBUG1);
}
PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
}
/*
- * OK to update backup counters and forcePageWrites
+ * OK to update backup counters, forcePageWrites and session-level lock.
+ *
+ * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
+ * Otherwise they can be updated inconsistently, which might cause
+ * do_pg_abort_backup() to fail.
*/
WALInsertLockAcquireExclusive();
if (exclusive)
{
XLogCtl->Insert.forcePageWrites = false;
}
+
+ /*
+ * Clean up session-level lock.
+ *
+ * You might think that WALInsertLockRelease() can be called before
+ * cleaning up session-level lock because session-level lock doesn't need
+ * to be protected with WAL insertion lock. But since
+ * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
+ * cleaned up before it.
+ */
+ sessionBackupState = SESSION_BACKUP_NONE;
+
WALInsertLockRelease();
/*
* backup. We have no way of checking if pg_control wasn't backed up last
* however.
*
- * We don't force a switch to new WAL file and wait for all the required
- * files to be archived. This is okay if we use the backup to start the
- * standby. But, if it's for an archive recovery, to ensure all the
- * required files are available, a user should wait for them to be
- * archived, or include them into the backup.
+ * We don't force a switch to new WAL file but it is still possible to
+ * wait for all the required files to be archived if waitforarchive is
+ * true. This is okay if we use the backup to start a standby and fetch
+ * the missing WAL using streaming replication. But in the case of an
+ * archive recovery, a user should set waitforarchive to true and wait for
+ * them to be archived to ensure that all the required files are
+ * available.
*
* We return the current minimum recovery point as the backup end
* location. Note that it can be greater than the exact backup end
if (startpoint <= recptr)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("WAL generated with full_page_writes=off was replayed "
- "during online backup"),
- errhint("This means that the backup being taken on the standby "
- "is corrupt and should not be used. "
- "Enable full_page_writes and run CHECKPOINT on the master, "
- "and then try an online backup again.")));
+ errmsg("WAL generated with full_page_writes=off was replayed "
+ "during online backup"),
+ errhint("This means that the backup being taken on the standby "
+ "is corrupt and should not be used. "
+ "Enable full_page_writes and run CHECKPOINT on the master, "
+ "and then try an online backup again.")));
LWLockAcquire(ControlFileLock, LW_SHARED);
stoppoint = ControlFile->minRecoveryPoint;
stoptli = ControlFile->minRecoveryPointTLI;
LWLockRelease(ControlFileLock);
-
- if (stoptli_p)
- *stoptli_p = stoptli;
- return stoppoint;
}
+ else
+ {
+ /*
+ * Write the backup-end xlog record
+ */
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
+ stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
+ stoptli = ThisTimeLineID;
- /*
- * Write the backup-end xlog record
- */
- XLogBeginInsert();
- XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
- stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
- stoptli = ThisTimeLineID;
+ /*
+ * Force a switch to a new xlog segment file, so that the backup is
+ * valid as soon as archiver moves out the current segment file.
+ */
+ RequestXLogSwitch(false);
- /*
- * Force a switch to a new xlog segment file, so that the backup is valid
- * as soon as archiver moves out the current segment file.
- */
- RequestXLogSwitch(false);
+ XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
+ XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
- XLByteToPrevSeg(stoppoint, _logSegNo);
- XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
+ /* Use the log timezone here, not the session timezone */
+ stamp_time = (pg_time_t) time(NULL);
+ pg_strftime(strfbuf, sizeof(strfbuf),
+ "%Y-%m-%d %H:%M:%S %Z",
+ pg_localtime(&stamp_time, log_timezone));
- /* Use the log timezone here, not the session timezone */
- stamp_time = (pg_time_t) time(NULL);
- pg_strftime(strfbuf, sizeof(strfbuf),
- "%Y-%m-%d %H:%M:%S %Z",
- pg_localtime(&stamp_time, log_timezone));
+ /*
+ * Write the backup history file
+ */
+ XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+ BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
+ startpoint, wal_segment_size);
+ fp = AllocateFile(histfilepath, "w");
+ if (!fp)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m",
+ histfilepath)));
+ fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
+ (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
+ fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
+ (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
- /*
- * Write the backup history file
- */
- XLByteToSeg(startpoint, _logSegNo);
- BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
- (uint32) (startpoint % XLogSegSize));
- fp = AllocateFile(histfilepath, "w");
- if (!fp)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create file \"%s\": %m",
- histfilepath)));
- fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
- (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
- fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
- (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
- /* transfer remaining lines from label to history file */
- fprintf(fp, "%s", remaining);
- fprintf(fp, "STOP TIME: %s\n", strfbuf);
- if (fflush(fp) || ferror(fp) || FreeFile(fp))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not write file \"%s\": %m",
- histfilepath)));
+ /*
+ * Transfer remaining lines including label and start timeline to
+ * history file.
+ */
+ fprintf(fp, "%s", remaining);
+ fprintf(fp, "STOP TIME: %s\n", strfbuf);
+ fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
+ if (fflush(fp) || ferror(fp) || FreeFile(fp))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m",
+ histfilepath)));
- /*
- * Clean out any no-longer-needed history files. As a side effect, this
- * will post a .ready file for the newly created history file, notifying
- * the archiver that history file may be archived immediately.
- */
- CleanupBackupHistory();
+ /*
+ * Clean out any no-longer-needed history files. As a side effect,
+ * this will post a .ready file for the newly created history file,
+ * notifying the archiver that history file may be archived
+ * immediately.
+ */
+ CleanupBackupHistory();
+ }
/*
* If archiving is enabled, wait for all the required WAL files to be
* archived before returning. If archiving isn't enabled, the required WAL
* needs to be transported via streaming replication (hopefully with
* wal_keep_segments set high enough), or some more exotic mechanism like
- * polling and copying files from pg_wal with script. We have no
- * knowledge of those mechanisms, so it's up to the user to ensure that he
- * gets all the required WAL.
+ * polling and copying files from pg_wal with script. We have no knowledge
+ * of those mechanisms, so it's up to the user to ensure that he gets all
+ * the required WAL.
*
* We wait until both the last WAL file filled during backup and the
* history file have been archived, and assume that the alphabetic sorting
*
* We wait forever, since archive_command is supposed to work and we
* assume the admin wanted his backup to work completely. If you don't
- * wish to wait, you can set statement_timeout. Also, some notices are
- * issued to clue in anyone who might be doing this interactively.
+ * wish to wait, then either waitforarchive should be passed in as false,
+ * or you can set statement_timeout. Also, some notices are issued to
+ * clue in anyone who might be doing this interactively.
*/
- if (waitforarchive && XLogArchivingActive())
+
+ if (waitforarchive &&
+ ((!backup_started_in_recovery && XLogArchivingActive()) ||
+ (backup_started_in_recovery && XLogArchivingAlways())))
{
- XLByteToPrevSeg(stoppoint, _logSegNo);
- XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
+ XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
+ XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
- XLByteToSeg(startpoint, _logSegNo);
- BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
- (uint32) (startpoint % XLogSegSize));
+ XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+ BackupHistoryFileName(histfilename, stoptli, _logSegNo,
+ startpoint, wal_segment_size);
seconds_before_warning = 60;
waits = 0;
void
do_pg_abort_backup(void)
{
+ /*
+ * Quick exit if session is not keeping around a non-exclusive backup
+ * already started.
+ */
+ if (sessionBackupState == SESSION_BACKUP_NONE)
+ return;
+
WALInsertLockAcquireExclusive();
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+ Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
XLogCtl->Insert.nonExclusiveBackups--;
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
* later than the start of the dump, and so if we rely on it as the start
* point, we will fail to restore a consistent database state.
*
- * Returns TRUE if a backup_label was found (and fills the checkpoint
+ * Returns true if a backup_label was found (and fills the checkpoint
* location and its REDO location into *checkPointLoc and RedoStartLSN,
- * respectively); returns FALSE if not. If this backup_label came from a
- * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
- * was created during recovery, *backupFromStandby is set to TRUE.
+ * respectively); returns false if not. If this backup_label came from a
+ * streamed backup, *backupEndRequired is set to true. If this backup_label
+ * was created during recovery, *backupFromStandby is set to true.
*/
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
bool *backupFromStandby)
{
char startxlogfilename[MAXFNAMELEN];
- TimeLineID tli;
+ TimeLineID tli_from_walseg,
+ tli_from_file;
FILE *lfp;
char ch;
char backuptype[20];
char backupfrom[20];
+ char backuplabel[MAXPGPATH];
+ char backuptime[128];
uint32 hi,
lo;
* format).
*/
if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
- &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
+ &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
ereport(FATAL,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
*backupFromStandby = true;
}
+ /*
+ * Parse START TIME and LABEL. Those are not mandatory fields for recovery,
+ * but checking for their presence is useful for debugging and for the
+ * sanity checks that follow. Note that the result buffers have a fixed
+ * size, so if the backup_label file was generated with strings longer
+ * than the maximum assumed here, they are parsed incorrectly. That's fine,
+ * as only minor consistency checks are done afterwards.
+ */
+ if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
+ ereport(DEBUG1,
+ (errmsg("backup time %s in file \"%s\"",
+ backuptime, BACKUP_LABEL_FILE)));
+
+ if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
+ ereport(DEBUG1,
+ (errmsg("backup label %s in file \"%s\"",
+ backuplabel, BACKUP_LABEL_FILE)));
+
+ /*
+ * START TIMELINE is new as of PostgreSQL 11. Parsing it is not mandatory,
+ * but use it as a sanity check if it is present.
+ */
+ if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
+ {
+ if (tli_from_walseg != tli_from_file)
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
+ errdetail("Timeline ID parsed is %u, but expected %u",
+ tli_from_file, tli_from_walseg)));
+
+ ereport(DEBUG1,
+ (errmsg("backup timeline %u in file \"%s\"",
+ tli_from_file, BACKUP_LABEL_FILE)));
+ }
+
if (ferror(lfp) || FreeFile(lfp))
ereport(FATAL,
(errcode_for_file_access(),
* recovering from a backup dump file, and we therefore need to create symlinks
* as per the information present in tablespace_map file.
*
- * Returns TRUE if a tablespace_map file was found (and fills the link
- * information for all the tablespace links present in file); returns FALSE
+ * Returns true if a tablespace_map file was found (and fills the link
+ * information for all the tablespace links present in file); returns false
* if not.
*/
static bool
if (sscanf(str, "%s %n", tbsoid, &n) != 1)
ereport(FATAL,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+ errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
tbslinkpath = str + n;
i = 0;
(XLogPageReadPrivate *) xlogreader->private_data;
int emode = private->emode;
uint32 targetPageOff;
- XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+ XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+ int r;
- XLByteToSeg(targetPagePtr, targetSegNo);
- targetPageOff = targetPagePtr % XLogSegSize;
+ XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
+ targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
/*
* See if we need to switch to a new segment because the requested record
* is not in the currently open one.
*/
- if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
+ if (readFile >= 0 &&
+ !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
{
/*
* Request a restartpoint if we've replayed too much xlog since the
readSource = 0;
}
- XLByteToSeg(targetPagePtr, readSegNo);
+ XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
retry:
/* See if we need to retrieve more data */
if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
readLen = XLOG_BLCKSZ;
else
- readLen = receivedUpto % XLogSegSize - targetPageOff;
+ readLen = XLogSegmentOffset(receivedUpto, wal_segment_size) -
+ targetPageOff;
}
else
readLen = XLOG_BLCKSZ;
if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
{
char fname[MAXFNAMELEN];
+ int save_errno = errno;
- XLogFileName(fname, curFileTLI, readSegNo);
+ XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
+ errno = save_errno;
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode_for_file_access(),
errmsg("could not seek in log segment %s to offset %u: %m",
goto next_record_is_invalid;
}
- if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+ r = read(readFile, readBuf, XLOG_BLCKSZ);
+ if (r != XLOG_BLCKSZ)
{
char fname[MAXFNAMELEN];
+ int save_errno = errno;
- XLogFileName(fname, curFileTLI, readSegNo);
- ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
- (errcode_for_file_access(),
- errmsg("could not read from log segment %s, offset %u: %m",
- fname, readOff)));
+ pgstat_report_wait_end();
+ XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
+ if (r < 0)
+ {
+ errno = save_errno;
+ ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+ (errcode_for_file_access(),
+ errmsg("could not read from log segment %s, offset %u: %m",
+ fname, readOff)));
+ }
+ else
+ ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read from log segment %s, offset %u: read %d of %zu",
+ fname, readOff, r, (Size) XLOG_BLCKSZ)));
goto next_record_is_invalid;
}
+ pgstat_report_wait_end();
Assert(targetSegNo == readSegNo);
Assert(targetPageOff == readOff);
Assert(reqLen <= readLen);
*readTLI = curFileTLI;
+
+ /*
+ * Check the page header immediately, so that we can retry immediately if
+ * it's not valid. This may seem unnecessary, because XLogReadRecord()
+ * validates the page header anyway, and would propagate the failure up to
+ * ReadRecord(), which would retry. However, there's a corner case with
+ * continuation records, if a record is split across two pages such that
+ * we would need to read the two pages from different sources. For
+ * example, imagine a scenario where a streaming replica is started up,
+ * and replay reaches a record that's split across two WAL segments. The
+ * first page is only available locally, in pg_wal, because it's already
+ * been recycled in the master. The second page, however, is not present
+ * in pg_wal, and we should stream it from the master. There is a recycled
+ * WAL segment present in pg_wal, with garbage contents, however. We would
+ * read the first page from the local WAL segment, but when reading the
+ * second page, we would read the bogus, recycled, WAL segment. If we
+ * didn't catch that case here, we would never recover, because
+ * ReadRecord() would retry reading the whole record from the beginning.
+ *
+ * Of course, this only catches errors in the page header, which is what
+ * happens in the case of a recycled WAL segment. Other kinds of errors or
+ * corruption still have the same problem. But this at least fixes the
+ * common case, which can happen as part of normal operation.
+ *
+ * Validating the page header is cheap enough that doing it twice
+ * shouldn't be a big deal from a performance point of view.
+ */
+ if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
+ {
+ /* reset any error XLogReaderValidatePageHeader() might have set */
+ xlogreader->errormsg_buf[0] = '\0';
+ goto next_record_is_invalid;
+ }
+
return readLen;
next_record_is_invalid:
}
/*
- * Open the WAL segment containing WAL position 'RecPtr'.
+ * Open the WAL segment containing WAL location 'RecPtr'.
*
* The segment can be fetched via restore_command, or via walreceiver having
* streamed the record, or it can already be present in pg_wal. Checking
{
static TimestampTz last_fail_time = 0;
TimestampTz now;
+ bool streaming_reply_sent = false;
/*-------
* Standby mode is implemented by a state machine:
* If primary_conninfo is set, launch walreceiver to try
* to stream the missing WAL.
*
- * If fetching_ckpt is TRUE, RecPtr points to the initial
+ * If fetching_ckpt is true, RecPtr points to the initial
* checkpoint location. In that case, we use RedoStartLSN
* as the streaming start position instead of RecPtr, so
* that when we later jump backwards to start redo at
}
else
{
- ptr = tliRecPtr;
+ ptr = RecPtr;
+
+ /*
+ * Use the record begin position to determine the
+ * TLI, rather than the position we're reading.
+ */
tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
if (curFileTLI > 0 && tli < curFileTLI)
elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
- (uint32) (ptr >> 32), (uint32) ptr,
+ (uint32) (tliRecPtr >> 32),
+ (uint32) tliRecPtr,
tli, curFileTLI);
}
curFileTLI = tli;
* little chance that the problem will just go away, but
* PANIC is not good for availability either, especially
* in hot standby mode. So, we treat that the same as
- * disconnection, and retry from archive/pg_wal again.
- * The WAL in the archive should be identical to what was
+ * disconnection, and retry from archive/pg_wal again. The
+ * WAL in the archive should be identical to what was
* streamed, so it's unlikely that it helps, but one can
* hope...
*/
*/
now = GetCurrentTimestamp();
if (!TimestampDifferenceExceeds(last_fail_time, now,
- wal_retrieve_retry_interval))
+ wal_retrieve_retry_interval))
{
long secs,
wait_time;
(secs * 1000 + usecs / 1000);
WaitLatch(&XLogCtl->recoveryWakeupLatch,
- WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
ResetLatch(&XLogCtl->recoveryWakeupLatch);
now = GetCurrentTimestamp();
* file from pg_wal.
*/
readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
- currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
+ currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
currentSource);
if (readFile >= 0)
return true; /* success! */
* not open already. Also read the timeline history
* file if we haven't initialized timeline history
* yet; it should be streamed over and present in
- * pg_wal by now. Use XLOG_FROM_STREAM so that
- * source info is set correctly and XLogReceiptTime
- * isn't changed.
+ * pg_wal by now. Use XLOG_FROM_STREAM so that source
+ * info is set correctly and XLogReceiptTime isn't
+ * changed.
*/
if (readFile < 0)
{
break;
}
+ /*
+ * Since we have replayed everything we have received so
+ * far and are about to start waiting for more WAL, let's
+ * tell the upstream server our replay location now so
+ * that pg_stat_replication doesn't show stale
+ * information.
+ */
+ if (!streaming_reply_sent)
+ {
+ WalRcvForceReply();
+ streaming_reply_sent = true;
+ }
+
/*
* Wait for more WAL to arrive. Time out after 5 seconds
* to react to a trigger file promptly.