* anything we saw during replay.
*
* We are able to remove segments no longer necessary by carefully tracking
- * each table's used values: during vacuum, any multixact older than a
- * certain value is removed; the cutoff value is stored in pg_class.
- * The minimum value in each database is stored in pg_database, and the
- * global minimum is part of pg_control. Any vacuum that is able to
- * advance its database's minimum value also computes a new global minimum,
- * and uses this value to truncate older segments. When new multixactid
- * values are to be created, care is taken that the counter does not
- * fall within the wraparound horizon considering the global minimum value.
+ * each table's used values: during vacuum, any multixact older than a certain
+ * value is removed; the cutoff value is stored in pg_class. The minimum value
+ * across all tables in each database is stored in pg_database, and the global
+ * minimum across all databases is part of pg_control and is kept in shared
+ * memory. At checkpoint time, once the value is known to be flushed to WAL,
+ * any files that correspond to multixacts older than that value are removed.
+ * (These files are also removed when a restartpoint is executed.)
+ *
+ * When new multixactid values are to be created, care is taken that the
+ * counter does not fall within the wraparound horizon considering the global
+ * minimum value.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
* MultiXact page numbering also wraps around at
* 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
- * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
* take no explicit notice of that fact in this module, except when comparing
* segment and page numbers in TruncateMultiXact (see
* MultiXactOffsetPagePrecedes).
/* next-to-be-assigned offset */
MultiXactOffset nextOffset;
- /* the Offset SLRU area was last truncated at this MultiXactId */
- MultiXactId lastTruncationPoint;
-
/*
- * oldest multixact that is still on disk. Anything older than this
- * should not be consulted.
+ * Oldest multixact that is still on disk. Anything older than this
+ * should not be consulted. These values are updated by vacuum.
*/
MultiXactId oldestMultiXactId;
Oid oldestMultiXactDB;
+ /*
+ * This is what the previous checkpoint stored as the truncate position.
+ * This value is the oldestMultiXactId that was valid when a checkpoint
+ * was last executed.
+ */
+ MultiXactId lastCheckpointedOldest;
+
/* support for anti-wraparound measures */
MultiXactId multiVacLimit;
MultiXactId multiWarnLimit;
* than its own OldestVisibleMXactId[] setting; this is necessary because
* the checkpointer could truncate away such data at any instant.
*
- * The checkpointer can compute the safe truncation point as the oldest
- * valid value among all the OldestMemberMXactId[] and
- * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
- * Clearly, it is not possible for any later-computed OldestVisibleMXactId
- * value to be older than this, and so there is no risk of truncating data
- * that is still needed.
+ * The oldest valid value among all of the OldestMemberMXactId[] and
+ * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
+ * possible value still having any live member transaction. Subtracting
+ * vacuum_multixact_freeze_min_age from that value, we obtain the freezing
+ * point for multixacts for that table. Any value older than that is
+ * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note
+ * that multis that have member xids that are older than the cutoff point
+ * for xids must also be frozen, even if the multis themselves are newer
+ * than the multixid cutoff point). Whenever a full table vacuum happens,
+ * the freezing point so computed is used as the new pg_class.relminmxid
+ * value. The minimum of all those values in a database is stored as
+ * pg_database.datminmxid. In turn, the minimum of all of those values is
+ * stored in pg_control and used as truncation point for pg_multixact. At
+ * checkpoint or restartpoint, unneeded segments are removed.
*/
MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */
} MultiXactStateData;
* We check known limits on MultiXact before resorting to the SLRU area.
*
* An ID older than MultiXactState->oldestMultiXactId cannot possibly be
- * useful; it should have already been removed by vacuum. We've truncated
- * the on-disk structures anyway. Returning the wrong values could lead
+ * useful; it has already been removed, or will be removed shortly, by
+ * truncation. Returning the wrong values could lead
* to an incorrect visibility result. However, to support pg_upgrade we
* need to allow an empty set to be returned regardless, if the caller is
* willing to accept it; the caller is expected to check that it's an
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
/*
- * (Re-)Initialize our idea of the latest page number.
+ * (Re-)Initialize our idea of the latest page number for offsets.
*/
pageno = MultiXactIdToOffsetPage(multi);
MultiXactOffsetCtl->shared->latest_page_number = pageno;
/*
* Zero out the remainder of the current offsets page. See notes in
- * StartupCLOG() for motivation.
+ * TrimCLOG() for motivation.
*/
entryno = MultiXactIdToOffsetEntry(multi);
if (entryno != 0)
LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
/*
- * (Re-)Initialize our idea of the latest page number.
+ * (Re-)Initialize our idea of the latest page number for members.
*/
pageno = MXOffsetToMemberPage(offset);
MultiXactMemberCtl->shared->latest_page_number = pageno;
SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
}
+/*
+ * Update the "safe truncation point". This is the newest value of oldestMulti
+ * that is known to be flushed as part of a checkpoint record.
+ *
+ * The value is saved in MultiXactState->lastCheckpointedOldest while holding
+ * MultiXactGenLock in exclusive mode; TruncateMultiXact() later reads it from
+ * there and uses it as the cutoff for truncation.
+ */
+void
+MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti)
+{
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->lastCheckpointedOldest = safeTruncateMulti;
+ LWLockRelease(MultiXactGenLock);
+}
+
/*
* Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
*
* Remove all MultiXactOffset and MultiXactMember segments before the oldest
* ones still of interest.
*
- * On a primary, this is called by vacuum after it has successfully advanced a
- * database's datminmxid value; the cutoff value we're passed is the minimum of
- * all databases' datminmxid values.
- *
- * During crash recovery, it's called from CreateRestartPoint() instead. We
- * rely on the fact that xlog_redo() will already have called
- * MultiXactAdvanceOldest(). Our latest_page_number will already have been
- * initialized by StartupMultiXact() and kept up to date as new pages are
- * zeroed.
+ * On a primary, this is called by the checkpointer process after a checkpoint
+ * has been flushed; during crash recovery, it's called from
+ * CreateRestartPoint(). In the latter case, we rely on the fact that
+ * xlog_redo() will already have called MultiXactAdvanceOldest(). Our
+ * latest_page_number will already have been initialized by StartupMultiXact()
+ * and kept up to date as new pages are zeroed.
*/
void
-TruncateMultiXact(MultiXactId oldestMXact)
+TruncateMultiXact(void)
{
+ MultiXactId oldestMXact;
MultiXactOffset oldestOffset;
MultiXactOffset nextOffset;
mxtruncinfo trunc;
MultiXactId earliest;
MembersLiveRange range;
+ Assert(AmCheckpointerProcess() || AmStartupProcess() ||
+ !IsPostmasterEnvironment);
+
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ oldestMXact = MultiXactState->lastCheckpointedOldest;
+ LWLockRelease(MultiXactGenLock);
+ Assert(MultiXactIdIsValid(oldestMXact));
+
/*
* Note we can't just plow ahead with the truncation; it's possible that
* there are no segments to truncate, which is a problem because we are
trunc.earliestExistingPage = -1;
SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
+ if (earliest < FirstMultiXactId)
+ earliest = FirstMultiXactId;
/* nothing to do */
if (MultiXactIdPrecedes(oldestMXact, earliest))
/*
* First, compute the safe truncation point for MultiXactMember. This is
- * the starting offset of the multixact we were passed as MultiXactOffset
- * cutoff.
+ * the starting offset of the oldest multixact.
*/
{
int pageno;
LWLockRelease(MultiXactOffsetControlLock);
}
- /* truncate MultiXactOffset */
- SimpleLruTruncate(MultiXactOffsetCtl,
- MultiXactIdToOffsetPage(oldestMXact));
-
/*
* To truncate MultiXactMembers, we need to figure out the active page
* range and delete all files outside that range. The start point is the
range.rangeEnd = MXOffsetToMemberPage(nextOffset);
SlruScanDirectory(MultiXactMemberCtl, SlruScanDirCbRemoveMembers, &range);
+
+ /* Now we can truncate MultiXactOffset */
+ SimpleLruTruncate(MultiXactOffsetCtl,
+ MultiXactIdToOffsetPage(oldestMXact));
+
}
/*
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
XLogCtl->ckptXid = checkPoint.nextXid;
*/
END_CRIT_SECTION();
+ /*
+ * Now that the checkpoint is safely on disk, we can update the point to
+ * which multixact can be truncated.
+ */
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
+
/*
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/
if (!RecoveryInProgress())
TruncateSUBTRANS(GetOldestXmin(NULL, false));
+ /*
+ * Truncate pg_multixact too.
+ */
+ TruncateMultiXact();
+
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
}
LWLockRelease(ControlFileLock);
- /*
- * Due to an historical accident multixact truncations are not WAL-logged,
- * but just performed everytime the mxact horizon is increased. So, unless
- * we explicitly execute truncations on a standby it will never clean out
- * /pg_multixact which obviously is bad, both because it uses space and
- * because we can wrap around into pre-existing data...
- *
- * We can only do the truncation here, after the UpdateControlFile()
- * above, because we've now safely established a restart point, that
- * guarantees we will not need need to access those multis.
- *
- * It's probably worth improving this.
- */
- TruncateMultiXact(lastCheckPoint.oldestMulti);
-
/*
* Delete old log files (those no longer needed even for previous
* checkpoint/restartpoint) to prevent the disk holding the xlog from
ThisTimeLineID = 0;
}
+ /*
+ * Due to an historical accident multixact truncations are not WAL-logged,
+ * but just performed every time the mxact horizon is increased. So, unless
+ * we explicitly execute truncations on a standby it will never clean out
+ * /pg_multixact which obviously is bad, both because it uses space and
+ * because we can wrap around into pre-existing data...
+ *
+ * We can only do the truncation here, after the UpdateControlFile()
+ * above, because we've now safely established a restart point. That
+ * guarantees we will not need to access those multis.
+ *
+ * It's probably worth improving this.
+ */
+ TruncateMultiXact();
+
/*
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
checkPoint.oldestXidDB);
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;