1 /*-------------------------------------------------------------------------
4 * PostgreSQL multi-transaction-log manager
6 * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 * TransactionId and a set of flag bits. The name is a bit historical:
10 * originally, a MultiXactId consisted of more than one TransactionId (except
11 * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 * legitimate to have MultiXactIds that only include a single Xid.
14 * The meaning of the flag bits is opaque to this module, but they are mostly
15 * used in heapam.c to identify lock modes that each of the member transactions
16 * is holding on any given tuple. This module just contains support to store
17 * and retrieve the arrays.
19 * We use two SLRU areas, one for storing the offsets at which the data
20 * starts for each MultiXactId in the other one. This trick allows us to
21 * store variable length arrays of TransactionIds. (We could alternatively
22 * use one area containing counts and TransactionIds, with valid MultiXactId
23 * values pointing at slots containing counts; but that way seems less robust
24 * since it would get completely confused if someone inquired about a bogus
25 * MultiXactId that pointed to an intermediate slot containing an XID.)
27 * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 * MEMBERs page is initialized to zeroes, as well as an
29 * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 * This module ignores the WAL rule "write xlog before data," because it
31 * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 * rule. The only way for the MXID to be referenced from any data page is for
33 * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 * an XLOG record that must follow ours. The normal LSN interlock between the
35 * data page and that XLOG record will ensure that our XLOG record reaches
36 * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 * module's XLOG records completely rebuild the data entered since the last
40 * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 * before each checkpoint is considered complete.
43 * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 * crashes and ensure that MXID and offset numbering increases monotonically
45 * across a crash. We do this in the same way as it's done for transaction
46 * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 * could need to worry about, and we just make sure that at the end of
48 * replay, the next-MXID and next-offset counters are at least as large as
49 * anything we saw during replay.
51 * We are able to remove segments no longer necessary by carefully tracking
52 * each table's used values: during vacuum, any multixact older than a certain
53 * value is removed; the cutoff value is stored in pg_class. The minimum value
54 * across all tables in each database is stored in pg_database, and the global
55 * minimum across all databases is part of pg_control and is kept in shared
56 * memory. Whenever that minimum is advanced, the SLRUs are truncated.
58 * When new multixactid values are to be created, care is taken that the
59 * counter does not fall within the wraparound horizon considering the global
60 * minimum value.
62 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
63 * Portions Copyright (c) 1994, Regents of the University of California
65 * src/backend/access/transam/multixact.c
67 *-------------------------------------------------------------------------
71 #include "access/multixact.h"
72 #include "access/slru.h"
73 #include "access/transam.h"
74 #include "access/twophase.h"
75 #include "access/twophase_rmgr.h"
76 #include "access/xact.h"
77 #include "access/xlog.h"
78 #include "access/xloginsert.h"
79 #include "catalog/pg_type.h"
80 #include "commands/dbcommands.h"
82 #include "lib/ilist.h"
83 #include "miscadmin.h"
85 #include "postmaster/autovacuum.h"
86 #include "storage/lmgr.h"
87 #include "storage/pmsignal.h"
88 #include "storage/proc.h"
89 #include "storage/procarray.h"
90 #include "utils/builtins.h"
91 #include "utils/memutils.h"
92 #include "utils/snapmgr.h"
96 * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
97 * used everywhere else in Postgres.
99 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
100 * MultiXact page numbering also wraps around at
101 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
102 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
103 * take no explicit notice of that fact in this module, except when comparing
104 * segment and page numbers in TruncateMultiXact (see
105 * MultiXactOffsetPagePrecedes).
108 /* We need four bytes per offset */
109 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
111 #define MultiXactIdToOffsetPage(xid) \
112 ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
113 #define MultiXactIdToOffsetEntry(xid) \
114 ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
115 #define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT)
118 * The situation for members is a bit more complex: we store one byte of
119 * additional flag bits for each TransactionId. To do this without getting
120 * into alignment issues, we store four bytes of flags, and then the
121 * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
122 * groups are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
123 * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
124 * performance) trumps space efficiency here.
126 * Note that the "offset" macros work with byte offset, not array indexes, so
127 * arithmetic must be done using "char *" pointers.
129 /* We need eight bits per xact, so one xact fits in a byte */
130 #define MXACT_MEMBER_BITS_PER_XACT 8
131 #define MXACT_MEMBER_FLAGS_PER_BYTE 1
132 #define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
134 /* how many full bytes of flags are there in a group? */
135 #define MULTIXACT_FLAGBYTES_PER_GROUP 4
136 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
137 (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
138 /* size in bytes of a complete group */
139 #define MULTIXACT_MEMBERGROUP_SIZE \
140 (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
141 #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
142 #define MULTIXACT_MEMBERS_PER_PAGE \
143 (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
146 * Because the number of items per page is not a divisor of the last item
147 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
148 * of pages, and moreover the last used page therein does not use the same
149 * number of items as previous pages. (Another way to say it is that the
150 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
151 * has some empty space after that item.)
153 * This constant is the number of members in the last page of the last segment.
155 #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
156 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
158 /* page in which a member is to be found */
159 #define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
160 #define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT)
162 /* Location (byte offset within page) of flag word for a given member */
163 #define MXOffsetToFlagsOffset(xid) \
164 ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
165 (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
166 (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
167 #define MXOffsetToFlagsBitShift(xid) \
168 (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
169 MXACT_MEMBER_BITS_PER_XACT)
171 /* Location (byte offset within page) of TransactionId of given member */
172 #define MXOffsetToMemberOffset(xid) \
173 (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
174 ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
176 /* Multixact members wraparound thresholds. */
177 #define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
178 #define MULTIXACT_MEMBER_DANGER_THRESHOLD \
179 (MaxMultiXactOffset - MaxMultiXactOffset / 4)
181 #define PreviousMultiXactId(xid) \
182 ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1)
185 * Links to shared-memory data structures for MultiXact control
187 static SlruCtlData MultiXactOffsetCtlData;
188 static SlruCtlData MultiXactMemberCtlData;
190 #define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
191 #define MultiXactMemberCtl (&MultiXactMemberCtlData)
194 * MultiXact state shared across all backends. All this state is protected
195 * by MultiXactGenLock. (We also use MultiXactOffsetControlLock and
196 * MultiXactMemberControlLock to guard accesses to the two sets of SLRU
197 * buffers. For concurrency's sake, we avoid holding more than one of these
198 * locks at a time.)
200 typedef struct MultiXactStateData
202 /* next-to-be-assigned MultiXactId */
203 MultiXactId nextMXact;
205 /* next-to-be-assigned offset */
206 MultiXactOffset nextOffset;
208 /* Have we completed multixact startup? */
209 bool finishedStartup;
212 * Oldest multixact that is still potentially referenced by a relation.
213 * Anything older than this should not be consulted. These values are
214 * updated by vacuum.
216 MultiXactId oldestMultiXactId;
217 Oid oldestMultiXactDB;
220 * Oldest multixact offset that is potentially referenced by a multixact
221 * referenced by a relation. We don't always know this value, so there's
222 * a flag here to indicate whether or not we currently do.
224 MultiXactOffset oldestOffset;
225 bool oldestOffsetKnown;
227 /* support for anti-wraparound measures */
228 MultiXactId multiVacLimit;
229 MultiXactId multiWarnLimit;
230 MultiXactId multiStopLimit;
231 MultiXactId multiWrapLimit;
233 /* support for members anti-wraparound measures */
234 MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
237 * Per-backend data starts here. We have two arrays stored in the area
238 * immediately following the MultiXactStateData struct. Each is indexed by
239 * BackendId.
241 * In both arrays, there's a slot for all normal backends (1..MaxBackends)
242 * followed by a slot for max_prepared_xacts prepared transactions. Valid
243 * BackendIds start from 1; element zero of each array is never used.
245 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
246 * transaction(s) could possibly be a member of, or InvalidMultiXactId
247 * when the backend has no live transaction that could possibly be a
248 * member of a MultiXact. Each backend sets its entry to the current
249 * nextMXact counter just before first acquiring a shared lock in a given
250 * transaction, and clears it at transaction end. (This works because only
251 * during or after acquiring a shared lock could an XID possibly become a
252 * member of a MultiXact, and that MultiXact would have to be created
253 * during or after the lock acquisition.)
255 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
256 * current transaction(s) think is potentially live, or InvalidMultiXactId
257 * when not in a transaction or not in a transaction that's paid any
258 * attention to MultiXacts yet. This is computed when first needed in a
259 * given transaction, and cleared at transaction end. We can compute it
260 * as the minimum of the valid OldestMemberMXactId[] entries at the time
261 * we compute it (using nextMXact if none are valid). Each backend is
262 * required not to attempt to access any SLRU data for MultiXactIds older
263 * than its own OldestVisibleMXactId[] setting; this is necessary because
264 * the checkpointer could truncate away such data at any instant.
266 * The oldest valid value among all of the OldestMemberMXactId[] and
267 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
268 * possible value still having any live member transaction. Subtracting
269 * vacuum_multixact_freeze_min_age from that value we obtain the freezing
270 * point for multixacts for that table. Any value older than that is
271 * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note
272 * that multis that have member xids that are older than the cutoff point
273 * for xids must also be frozen, even if the multis themselves are newer
274 * than the multixid cutoff point). Whenever a full table vacuum happens,
275 * the freezing point so computed is used as the new pg_class.relminmxid
276 * value. The minimum of all those values in a database is stored as
277 * pg_database.datminmxid. In turn, the minimum of all of those values is
278 * stored in pg_control and used as truncation point for pg_multixact. At
279 * checkpoint or restartpoint, unneeded segments are removed.
281 MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
282 } MultiXactStateData;
285 * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays.
286 * Valid elements are (1..MaxOldestSlot); element 0 is never used.
288 #define MaxOldestSlot (MaxBackends + max_prepared_xacts)
290 /* Pointers to the state data in shared memory */
291 static MultiXactStateData *MultiXactState;
292 static MultiXactId *OldestMemberMXactId;
293 static MultiXactId *OldestVisibleMXactId;
297 * Definitions for the backend-local MultiXactId cache.
299 * We use this cache to store known MultiXacts, so we don't need to go to
300 * SLRU areas every time.
302 * The cache lasts for the duration of a single transaction, the rationale
303 * for this being that most entries will contain our own TransactionId and
304 * so they will be uninteresting by the time our next transaction starts.
305 * (XXX not clear that this is correct --- other members of the MultiXact
306 * could hang around longer than we did. However, it's not clear what a
307 * better policy for flushing old cache entries would be.) FIXME actually
308 * this is plain wrong now that multixacts may contain update Xids.
310 * We allocate the cache entries in a memory context that is deleted at
311 * transaction end, so we don't need to do retail freeing of entries.
313 typedef struct mXactCacheEnt
318 MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
321 #define MAX_CACHE_ENTRIES 256
322 static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
323 static int MXactCacheMembers = 0;
324 static MemoryContext MXactContext = NULL;
326 #ifdef MULTIXACT_DEBUG
327 #define debug_elog2(a,b) elog(a,b)
328 #define debug_elog3(a,b,c) elog(a,b,c)
329 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
330 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
331 #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
333 #define debug_elog2(a,b)
334 #define debug_elog3(a,b,c)
335 #define debug_elog4(a,b,c,d)
336 #define debug_elog5(a,b,c,d,e)
337 #define debug_elog6(a,b,c,d,e,f)
340 /* internal MultiXactId management */
341 static void MultiXactIdSetOldestVisible(void);
342 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
343 int nmembers, MultiXactMember *members);
344 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
346 /* MultiXact cache management */
347 static int mxactMemberComparator(const void *arg1, const void *arg2);
348 static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
349 static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
350 static void mXactCachePut(MultiXactId multi, int nmembers,
351 MultiXactMember *members);
353 static char *mxstatus_to_string(MultiXactStatus status);
355 /* management of SLRU infrastructure */
356 static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
357 static int ZeroMultiXactMemberPage(int pageno, bool writeXlog);
358 static bool MultiXactOffsetPagePrecedes(int page1, int page2);
359 static bool MultiXactMemberPagePrecedes(int page1, int page2);
360 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
361 MultiXactOffset offset2);
362 static void ExtendMultiXactOffset(MultiXactId multi);
363 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
364 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
365 MultiXactOffset start, uint32 distance);
366 static bool SetOffsetVacuumLimit(bool is_startup);
367 static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
368 static void WriteMZeroPageXlogRec(int pageno, uint8 info);
369 static void WriteMTruncateXlogRec(Oid oldestMultiDB,
370 MultiXactId startTruncOff,
371 MultiXactId endTruncOff,
372 MultiXactOffset startTruncMemb,
373 MultiXactOffset endTruncMemb);
378 * Construct a MultiXactId representing two TransactionIds.
380 * The two XIDs must be different, or be requesting different statuses.
382 * NB - we don't worry about our local MultiXactId cache here, because that
383 * is handled by the lower-level routines.
386 MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
387 TransactionId xid2, MultiXactStatus status2)
389 MultiXactId newMulti;
390 MultiXactMember members[2];
392 AssertArg(TransactionIdIsValid(xid1));
393 AssertArg(TransactionIdIsValid(xid2));
395 Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
397 /* MultiXactIdSetOldestMember() must have been called already. */
398 Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
401 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
402 * are still running. In typical usage, xid2 will be our own XID and the
403 * caller just did a check on xid1, so it'd be wasted effort.
406 members[0].xid = xid1;
407 members[0].status = status1;
408 members[1].xid = xid2;
409 members[1].status = status2;
411 newMulti = MultiXactIdCreateFromMembers(2, members);
413 debug_elog3(DEBUG2, "Create: %s",
414 mxid_to_string(newMulti, 2, members));
421 * Add a TransactionId to a pre-existing MultiXactId.
423 * If the TransactionId is already a member of the passed MultiXactId with the
424 * same status, just return it as-is.
426 * Note that we do NOT actually modify the membership of a pre-existing
427 * MultiXactId; instead we create a new one. This is necessary to avoid
428 * a race condition against code trying to wait for one MultiXactId to finish;
429 * see notes in heapam.c.
431 * NB - we don't worry about our local MultiXactId cache here, because that
432 * is handled by the lower-level routines.
434 * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
435 * one upgraded by pg_upgrade from a cluster older than this feature) are not
436 * passed in.
439 MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
441 MultiXactId newMulti;
442 MultiXactMember *members;
443 MultiXactMember *newMembers;
448 AssertArg(MultiXactIdIsValid(multi));
449 AssertArg(TransactionIdIsValid(xid));
451 /* MultiXactIdSetOldestMember() must have been called already. */
452 Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
454 debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
455 multi, xid, mxstatus_to_string(status));
458 * Note: we don't allow for old multis here. The reason is that the only
459 * caller of this function does a check that the multixact is no longer
460 * running.
462 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
466 MultiXactMember member;
469 * The MultiXactId is obsolete. This can only happen if all the
470 * MultiXactId members stop running between the caller checking and
471 * passing it to us. It would be better to return that fact to the
472 * caller, but it would complicate the API and it's unlikely to happen
473 * too often, so just deal with it by creating a singleton MultiXact.
476 member.status = status;
477 newMulti = MultiXactIdCreateFromMembers(1, &member);
479 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
485 * If the TransactionId is already a member of the MultiXactId with the
486 * same status, just return the existing MultiXactId.
488 for (i = 0; i < nmembers; i++)
490 if (TransactionIdEquals(members[i].xid, xid) &&
491 (members[i].status == status))
493 debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
501 * Determine which of the members of the MultiXactId are still of
502 * interest. This is any running transaction, and also any transaction
503 * that grabbed something stronger than just a lock and was committed. (An
504 * update that aborted is of no interest here; and having more than one
505 * update Xid in a multixact would cause errors elsewhere.)
507 * Removing dead members is not just an optimization: freezing of tuples
508 * whose Xmax are multis depends on this behavior.
510 * Note we have the same race condition here as above: j could be 0 at the
511 * end of the loop.
513 newMembers = (MultiXactMember *)
514 palloc(sizeof(MultiXactMember) * (nmembers + 1));
516 for (i = 0, j = 0; i < nmembers; i++)
518 if (TransactionIdIsInProgress(members[i].xid) ||
519 (ISUPDATE_from_mxstatus(members[i].status) &&
520 TransactionIdDidCommit(members[i].xid)))
522 newMembers[j].xid = members[i].xid;
523 newMembers[j++].status = members[i].status;
527 newMembers[j].xid = xid;
528 newMembers[j++].status = status;
529 newMulti = MultiXactIdCreateFromMembers(j, newMembers);
534 debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
540 * MultiXactIdIsRunning
541 * Returns whether a MultiXactId is "running".
543 * We return true if at least one member of the given MultiXactId is still
544 * running. Note that a "false" result is certain not to change,
545 * because it is not legal to add members to an existing MultiXactId.
547 * Caller is expected to have verified that the multixact does not come from
548 * a pg_upgraded share-locked tuple.
551 MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
553 MultiXactMember *members;
557 debug_elog3(DEBUG2, "IsRunning %u?", multi);
560 * "false" here means we assume our callers have checked that the given
561 * multi cannot possibly come from a pg_upgraded database.
563 nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
567 debug_elog2(DEBUG2, "IsRunning: no members");
572 * Checking for myself is cheap compared to looking in shared memory;
573 * return true if any live subtransaction of the current top-level
574 * transaction is a member.
576 * This is not needed for correctness, it's just a fast path.
578 for (i = 0; i < nmembers; i++)
580 if (TransactionIdIsCurrentTransactionId(members[i].xid))
582 debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
589 * This could be made faster by having another entry point in procarray.c,
590 * walking the PGPROC array only once for all the members. But in most
591 * cases nmembers should be small enough that it doesn't much matter.
593 for (i = 0; i < nmembers; i++)
595 if (TransactionIdIsInProgress(members[i].xid))
597 debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
606 debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
612 * MultiXactIdSetOldestMember
613 * Save the oldest MultiXactId this transaction could be a member of.
615 * We set the OldestMemberMXactId for a given transaction the first time it's
616 * going to do some operation that might require a MultiXactId (tuple lock,
617 * update or delete). We need to do this even if we end up using a
618 * TransactionId instead of a MultiXactId, because there is a chance that
619 * another transaction would add our XID to a MultiXactId.
621 * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
622 * be called just before doing any such possibly-MultiXactId-able operation.
625 MultiXactIdSetOldestMember(void)
627 if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]))
629 MultiXactId nextMXact;
632 * You might think we don't need to acquire a lock here, since
633 * fetching and storing of TransactionIds is probably atomic, but in
634 * fact we do: suppose we pick up nextMXact and then lose the CPU for
635 * a long time. Someone else could advance nextMXact, and then
636 * another someone else could compute an OldestVisibleMXactId that
637 * would be after the value we are going to store when we get control
638 * back. Which would be wrong.
640 * Note that a shared lock is sufficient, because it's enough to stop
641 * someone from advancing nextMXact; and nobody else could be trying
642 * to write to our OldestMember entry, only reading (and we assume
643 * storing it is atomic.)
645 LWLockAcquire(MultiXactGenLock, LW_SHARED);
648 * We have to beware of the possibility that nextMXact is in the
649 * wrapped-around state. We don't fix the counter itself here, but we
650 * must be sure to store a valid value in our array entry.
652 nextMXact = MultiXactState->nextMXact;
653 if (nextMXact < FirstMultiXactId)
654 nextMXact = FirstMultiXactId;
656 OldestMemberMXactId[MyBackendId] = nextMXact;
658 LWLockRelease(MultiXactGenLock);
660 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
661 MyBackendId, nextMXact);
666 * MultiXactIdSetOldestVisible
667 * Save the oldest MultiXactId this transaction considers possibly live.
669 * We set the OldestVisibleMXactId for a given transaction the first time
670 * it's going to inspect any MultiXactId. Once we have set this, we are
671 * guaranteed that the checkpointer won't truncate off SLRU data for
672 * MultiXactIds at or after our OldestVisibleMXactId.
674 * The value to set is the oldest of nextMXact and all the valid per-backend
675 * OldestMemberMXactId[] entries. Because of the locking we do, we can be
676 * certain that no subsequent call to MultiXactIdSetOldestMember can set
677 * an OldestMemberMXactId[] entry older than what we compute here. Therefore
678 * there is no live transaction, now or later, that can be a member of any
679 * MultiXactId older than the OldestVisibleMXactId we compute here.
682 MultiXactIdSetOldestVisible(void)
684 if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
686 MultiXactId oldestMXact;
689 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
692 * We have to beware of the possibility that nextMXact is in the
693 * wrapped-around state. We don't fix the counter itself here, but we
694 * must be sure to store a valid value in our array entry.
696 oldestMXact = MultiXactState->nextMXact;
697 if (oldestMXact < FirstMultiXactId)
698 oldestMXact = FirstMultiXactId;
700 for (i = 1; i <= MaxOldestSlot; i++)
702 MultiXactId thisoldest = OldestMemberMXactId[i];
704 if (MultiXactIdIsValid(thisoldest) &&
705 MultiXactIdPrecedes(thisoldest, oldestMXact))
706 oldestMXact = thisoldest;
709 OldestVisibleMXactId[MyBackendId] = oldestMXact;
711 LWLockRelease(MultiXactGenLock);
713 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
714 MyBackendId, oldestMXact);
719 * ReadNextMultiXactId
720 * Return the next MultiXactId to be assigned, but don't allocate it
723 ReadNextMultiXactId(void)
727 /* XXX we could presumably do this without a lock. */
728 LWLockAcquire(MultiXactGenLock, LW_SHARED);
729 mxid = MultiXactState->nextMXact;
730 LWLockRelease(MultiXactGenLock);
732 if (mxid < FirstMultiXactId)
733 mxid = FirstMultiXactId;
739 * MultiXactIdCreateFromMembers
740 * Make a new MultiXactId from the specified set of members
742 * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
743 * given TransactionIds as members. Returns the newly created MultiXactId.
745 * NB: the passed members[] array will be sorted in-place.
748 MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
751 MultiXactOffset offset;
752 xl_multixact_create xlrec;
754 debug_elog3(DEBUG2, "Create: %s",
755 mxid_to_string(InvalidMultiXactId, nmembers, members));
758 * See if the same set of members already exists in our cache; if so, just
759 * re-use that MultiXactId. (Note: it might seem that looking in our
760 * cache is insufficient, and we ought to search disk to see if a
761 * duplicate definition already exists. But since we only ever create
762 * MultiXacts containing our own XID, in most cases any such MultiXacts
763 * were in fact created by us, and so will be in our cache. There are
764 * corner cases where someone else added us to a MultiXact without our
765 * knowledge, but it's not worth checking for.)
767 multi = mXactCacheGetBySet(nmembers, members);
768 if (MultiXactIdIsValid(multi))
770 debug_elog2(DEBUG2, "Create: in cache!");
774 /* Verify that there is at most one update Xid among the given members. */
777 bool has_update = false;
779 for (i = 0; i < nmembers; i++)
781 if (ISUPDATE_from_mxstatus(members[i].status))
784 elog(ERROR, "new multixact has more than one updating member");
791 * Assign the MXID and offsets range to use, and make sure there is space
792 * in the OFFSETs and MEMBERs files. NB: this routine does
793 * START_CRIT_SECTION().
795 * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
796 * that we've called MultiXactIdSetOldestMember here. This is because
797 * this routine is used in some places to create new MultiXactIds of which
798 * the current backend is not a member, notably during freezing of multis
799 * in vacuum. During vacuum, in particular, it would be unacceptable to
800 * keep OldestMulti set, in case it runs for long.
802 multi = GetNewMultiXactId(nmembers, &offset);
804 /* Make an XLOG entry describing the new MXID. */
807 xlrec.nmembers = nmembers;
810 * XXX Note: there's a lot of padding space in MultiXactMember. We could
811 * find a more compact representation of this Xlog record -- perhaps all
812 * the status flags in one XLogRecData, then all the xids in another one?
813 * Not clear that it's worth the trouble though.
816 XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
817 XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember));
819 (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
821 /* Now enter the information into the OFFSETs and MEMBERs logs */
822 RecordNewMultiXact(multi, offset, nmembers, members);
824 /* Done with critical section */
827 /* Store the new MultiXactId in the local cache, too */
828 mXactCachePut(multi, nmembers, members);
830 debug_elog2(DEBUG2, "Create: all done");
837 * Write info about a new multixact into the offsets and members files
839 * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
840 * use it.
843 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
844 int nmembers, MultiXactMember *members)
850 MultiXactOffset *offptr;
853 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
855 pageno = MultiXactIdToOffsetPage(multi);
856 entryno = MultiXactIdToOffsetEntry(multi);
859 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
860 * to complain about if there's any I/O error. This is kinda bogus, but
861 * since the errors will always give the full pathname, it should be clear
862 * enough that a MultiXactId is really involved. Perhaps someday we'll
863 * take the trouble to generalize the slru.c error reporting code.
865 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
866 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
871 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
873 /* Exchange our lock */
874 LWLockRelease(MultiXactOffsetControlLock);
876 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
880 for (i = 0; i < nmembers; i++, offset++)
882 TransactionId *memberptr;
889 Assert(members[i].status <= MultiXactStatusUpdate);
891 pageno = MXOffsetToMemberPage(offset);
892 memberoff = MXOffsetToMemberOffset(offset);
893 flagsoff = MXOffsetToFlagsOffset(offset);
894 bshift = MXOffsetToFlagsBitShift(offset);
896 if (pageno != prev_pageno)
898 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
899 prev_pageno = pageno;
902 memberptr = (TransactionId *)
903 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
905 *memberptr = members[i].xid;
907 flagsptr = (uint32 *)
908 (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
910 flagsval = *flagsptr;
911 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
912 flagsval |= (members[i].status << bshift);
913 *flagsptr = flagsval;
915 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
918 LWLockRelease(MultiXactMemberControlLock);
923 * Get the next MultiXactId.
925 * Also, reserve the needed amount of space in the "members" area. The
926 * starting offset of the reserved space is returned in *offset.
928 * This may generate XLOG records for expansion of the offsets and/or members
929 * files. Unfortunately, we have to do that while holding MultiXactGenLock
930 * to avoid race conditions --- the XLOG record for zeroing a page must appear
931 * before any backend can possibly try to store data in that page!
933 * We start a critical section before advancing the shared counters. The
934 * caller must end the critical section after writing SLRU data.
937 GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
940 MultiXactOffset nextOffset;
942 debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
944 /* safety check, we should never get this far in a HS standby */
945 if (RecoveryInProgress())
946 elog(ERROR, "cannot assign MultiXactIds during recovery");
948 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
950 /* Handle wraparound of the nextMXact counter */
951 if (MultiXactState->nextMXact < FirstMultiXactId)
952 MultiXactState->nextMXact = FirstMultiXactId;
954 /* Assign the MXID */
955 result = MultiXactState->nextMXact;
958 * Check to see if it's safe to assign another MultiXactId. This protects
959 * against catastrophic data loss due to multixact wraparound. The basic
962 * If we're past multiVacLimit or the safe threshold for member storage
963 * space, or we don't know what the safe threshold for member storage is,
964 * start trying to force autovacuum cycles.
965 * If we're past multiWarnLimit, start issuing warnings.
966 * If we're past multiStopLimit, refuse to create new MultiXactIds.
968 * Note these are pretty much the same protections in GetNewTransactionId.
971 if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
974 * For safety's sake, we release MultiXactGenLock while sending
975 * signals, warnings, etc. This is not so much because we care about
976 * preserving concurrency in this situation, as to avoid any
977 * possibility of deadlock while doing get_database_name(). First,
978 * copy all the shared values we'll need in this path.
980 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
981 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
982 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
983 Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
985 LWLockRelease(MultiXactGenLock);
987 if (IsUnderPostmaster &&
988 !MultiXactIdPrecedes(result, multiStopLimit))
990 char *oldest_datname = get_database_name(oldest_datoid);
993 * Immediately kick autovacuum into action as we're already in
996 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
998 /* complain even if that DB has disappeared */
1001 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1002 errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1004 errhint("Execute a database-wide VACUUM in that database.\n"
1005 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1008 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1009 errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u",
1011 errhint("Execute a database-wide VACUUM in that database.\n"
1012 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1016 * To avoid swamping the postmaster with signals, we issue the autovac
1017 * request only once per 64K multis generated. This still gives
1018 * plenty of chances before we get into real trouble.
1020 if (IsUnderPostmaster && (result % 65536) == 0)
1021 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1023 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1025 char *oldest_datname = get_database_name(oldest_datoid);
1027 /* complain even if that DB has disappeared */
1030 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1031 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1032 multiWrapLimit - result,
1034 multiWrapLimit - result),
1035 errhint("Execute a database-wide VACUUM in that database.\n"
1036 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1039 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1040 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1041 multiWrapLimit - result,
1043 multiWrapLimit - result),
1044 errhint("Execute a database-wide VACUUM in that database.\n"
1045 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1048 /* Re-acquire lock and start over */
1049 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1050 result = MultiXactState->nextMXact;
1051 if (result < FirstMultiXactId)
1052 result = FirstMultiXactId;
1055 /* Make sure there is room for the MXID in the file. */
1056 ExtendMultiXactOffset(result);
1059 * Reserve the members space, similarly to above. Also, be careful not to
1060 * return zero as the starting offset for any multixact. See
1061 * GetMultiXactIdMembers() for motivation.
1063 nextOffset = MultiXactState->nextOffset;
1064 if (nextOffset == 0)
1067 nmembers++; /* allocate member slot 0 too */
1070 *offset = nextOffset;
1073 * Protect against overrun of the members space as well, with the
1076 * If we're past offsetStopLimit, refuse to generate more multis.
1077 * If we're close to offsetStopLimit, emit a warning.
1079 * Arbitrarily, we start emitting warnings when we're 20 segments or less
1080 * from offsetStopLimit.
1082 * Note we haven't updated the shared state yet, so if we fail at this
1083 * point, the multixact ID we grabbed can still be used by the next guy.
1085 * Note that there is no point in forcing autovacuum runs here: the
1086 * multixact freeze settings would have to be reduced for that to have any
1090 #define OFFSET_WARN_SEGMENTS 20
1091 if (MultiXactState->oldestOffsetKnown &&
1092 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
1095 /* see comment in the corresponding offsets wraparound case */
1096 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1099 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1100 errmsg("multixact \"members\" limit exceeded"),
1101 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1102 "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1103 MultiXactState->offsetStopLimit - nextOffset - 1,
1105 MultiXactState->offsetStopLimit - nextOffset - 1),
1106 errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.",
1107 MultiXactState->oldestMultiXactDB)));
1111 * Check whether we should kick autovacuum into action, to prevent members
1112 * wraparound. NB we use a much larger window to trigger autovacuum than
1113 * just the warning limit. The warning is just a measure of last resort -
1114 * this is in line with GetNewTransactionId's behaviour.
1116 if (!MultiXactState->oldestOffsetKnown ||
1117 (MultiXactState->nextOffset - MultiXactState->oldestOffset
1118 > MULTIXACT_MEMBER_SAFE_THRESHOLD))
1121 * To avoid swamping the postmaster with signals, we issue the autovac
1122 * request only when crossing a segment boundary. With default
1123 * compilation settings that's roughly after 50k members. This still
1124 * gives plenty of chances before we get into real trouble.
1126 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1127 (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1128 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1131 if (MultiXactState->oldestOffsetKnown &&
1132 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
1134 nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
1136 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1137 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1138 "database with OID %u must be vacuumed before %d more multixact members are used",
1139 MultiXactState->offsetStopLimit - nextOffset + nmembers,
1140 MultiXactState->oldestMultiXactDB,
1141 MultiXactState->offsetStopLimit - nextOffset + nmembers),
1142 errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.")));
1144 ExtendMultiXactMember(nextOffset, nmembers);
1147 * Critical section from here until caller has written the data into the
1148 * just-reserved SLRU space; we don't want to error out with a partly
1149 * written MultiXact structure. (In particular, failing to write our
1150 * start offset after advancing nextMXact would effectively corrupt the
1151 * previous MultiXact.)
1153 START_CRIT_SECTION();
1156 * Advance counters. As in GetNewTransactionId(), this must not happen
1157 * until after file extension has succeeded!
1159 * We don't care about MultiXactId wraparound here; it will be handled by
1160 * the next iteration. But note that nextMXact may be InvalidMultiXactId
1161 * or the first value on a segment-beginning page after this routine
1162 * exits, so anyone else looking at the variable must be prepared to deal
1163 * with either case. Similarly, nextOffset may be zero, but we won't use
1164 * that as the actual start offset of the next multixact.
1166 (MultiXactState->nextMXact)++;
1168 MultiXactState->nextOffset += nmembers;
1170 LWLockRelease(MultiXactGenLock);
1172 debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1177 * GetMultiXactIdMembers
1178 * Return the set of MultiXactMembers that make up a MultiXactId
1180 * Return value is the number of members found, or -1 if there are none,
1181 * and *members is set to a newly palloc'ed array of members. It's the
1182 * caller's responsibility to free it when done with it.
1184 * from_pgupgrade must be passed as true if and only if only the multixact
1185 * corresponds to a value from a tuple that was locked in a 9.2-or-older
1186 * installation and later pg_upgrade'd (that is, the infomask is
1187 * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1188 * can still be running, so we return -1 just like for an empty multixact
1189 * without any further checking. It would be wrong to try to resolve such a
1190 * multixact: either the multixact is within the current valid multixact
1191 * range, in which case the returned result would be bogus, or outside that
1192 * range, in which case an error would be raised.
1194 * In all other cases, the passed multixact must be within the known valid
1195 * range, that is, greater to or equal than oldestMultiXactId, and less than
1196 * nextMXact. Otherwise, an error is raised.
1198 * onlyLock must be set to true if caller is certain that the given multi
1199 * is used only to lock tuples; can be false without loss of correctness,
1200 * but passing a true means we can return quickly without checking for
1204 GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1205 bool from_pgupgrade, bool onlyLock)
1211 MultiXactOffset *offptr;
1212 MultiXactOffset offset;
1216 MultiXactId oldestMXact;
1217 MultiXactId nextMXact;
1218 MultiXactId tmpMXact;
1219 MultiXactOffset nextOffset;
1220 MultiXactMember *ptr;
1222 debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1224 if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1227 /* See if the MultiXactId is in the local cache */
1228 length = mXactCacheGetById(multi, members);
1231 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1232 mxid_to_string(multi, length, *members));
1236 /* Set our OldestVisibleMXactId[] entry if we didn't already */
1237 MultiXactIdSetOldestVisible();
1240 * If we know the multi is used only for locking and not for updates, then
1241 * we can skip checking if the value is older than our oldest visible
1242 * multi. It cannot possibly still be running.
1245 MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
1247 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1253 * We check known limits on MultiXact before resorting to the SLRU area.
1255 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1256 * useful; it has already been removed, or will be removed shortly, by
1257 * truncation. If one is passed, an error is raised.
1259 * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1260 * implies undetected ID wraparound has occurred. This raises a hard
1263 * Shared lock is enough here since we aren't modifying any global state.
1264 * Acquire it just long enough to grab the current counter values. We may
1265 * need both nextMXact and nextOffset; see below.
1267 LWLockAcquire(MultiXactGenLock, LW_SHARED);
1269 oldestMXact = MultiXactState->oldestMultiXactId;
1270 nextMXact = MultiXactState->nextMXact;
1271 nextOffset = MultiXactState->nextOffset;
1273 LWLockRelease(MultiXactGenLock);
1275 if (MultiXactIdPrecedes(multi, oldestMXact))
1278 (errcode(ERRCODE_INTERNAL_ERROR),
1279 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1284 if (!MultiXactIdPrecedes(multi, nextMXact))
1286 (errcode(ERRCODE_INTERNAL_ERROR),
1287 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1291 * Find out the offset at which we need to start reading MultiXactMembers
1292 * and the number of members in the multixact. We determine the latter as
1293 * the difference between this multixact's starting offset and the next
1294 * one's. However, there are some corner cases to worry about:
1296 * 1. This multixact may be the latest one created, in which case there is
1297 * no next one to look at. In this case the nextOffset value we just
1298 * saved is the correct endpoint.
1300 * 2. The next multixact may still be in process of being filled in: that
1301 * is, another process may have done GetNewMultiXactId but not yet written
1302 * the offset entry for that ID. In that scenario, it is guaranteed that
1303 * the offset entry for that multixact exists (because GetNewMultiXactId
1304 * won't release MultiXactGenLock until it does) but contains zero
1305 * (because we are careful to pre-zero offset pages). Because
1306 * GetNewMultiXactId will never return zero as the starting offset for a
1307 * multixact, when we read zero as the next multixact's offset, we know we
1308 * have this case. We sleep for a bit and try again.
1310 * 3. Because GetNewMultiXactId increments offset zero to offset one to
1311 * handle case #2, there is an ambiguity near the point of offset
1312 * wraparound. If we see next multixact's offset is one, is that our
1313 * multixact's actual endpoint, or did it end at zero with a subsequent
1314 * increment? We handle this using the knowledge that if the zero'th
1315 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1316 * transaction ID so it can't be a multixact member. Therefore, if we
1317 * read a zero from the members array, just ignore it.
1319 * This is all pretty messy, but the mess occurs only in infrequent corner
1320 * cases, so it seems better than holding the MultiXactGenLock for a long
1321 * time on every multixact creation.
1324 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1326 pageno = MultiXactIdToOffsetPage(multi);
1327 entryno = MultiXactIdToOffsetEntry(multi);
1329 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1330 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1334 Assert(offset != 0);
1337 * Use the same increment rule as GetNewMultiXactId(), that is, don't
1338 * handle wraparound explicitly until needed.
1340 tmpMXact = multi + 1;
1342 if (nextMXact == tmpMXact)
1344 /* Corner case 1: there is no next multixact */
1345 length = nextOffset - offset;
1349 MultiXactOffset nextMXOffset;
1351 /* handle wraparound if needed */
1352 if (tmpMXact < FirstMultiXactId)
1353 tmpMXact = FirstMultiXactId;
1355 prev_pageno = pageno;
1357 pageno = MultiXactIdToOffsetPage(tmpMXact);
1358 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1360 if (pageno != prev_pageno)
1361 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1363 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1365 nextMXOffset = *offptr;
1367 if (nextMXOffset == 0)
1369 /* Corner case 2: next multixact is still being filled in */
1370 LWLockRelease(MultiXactOffsetControlLock);
1371 CHECK_FOR_INTERRUPTS();
1376 length = nextMXOffset - offset;
1379 LWLockRelease(MultiXactOffsetControlLock);
1381 ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1384 /* Now get the members themselves. */
1385 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
1389 for (i = 0; i < length; i++, offset++)
1391 TransactionId *xactptr;
1397 pageno = MXOffsetToMemberPage(offset);
1398 memberoff = MXOffsetToMemberOffset(offset);
1400 if (pageno != prev_pageno)
1402 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1403 prev_pageno = pageno;
1406 xactptr = (TransactionId *)
1407 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1409 if (!TransactionIdIsValid(*xactptr))
1411 /* Corner case 3: we must be looking at unused slot zero */
1412 Assert(offset == 0);
1416 flagsoff = MXOffsetToFlagsOffset(offset);
1417 bshift = MXOffsetToFlagsBitShift(offset);
1418 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1420 ptr[truelength].xid = *xactptr;
1421 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1425 LWLockRelease(MultiXactMemberControlLock);
1428 * Copy the result into the local cache.
1430 mXactCachePut(multi, truelength, ptr);
1432 debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1433 mxid_to_string(multi, truelength, ptr));
1438 * mxactMemberComparator
1439 * qsort comparison function for MultiXactMember
1441 * We can't use wraparound comparison for XIDs because that does not respect
1442 * the triangle inequality! Any old sort order will do.
1445 mxactMemberComparator(const void *arg1, const void *arg2)
1447 MultiXactMember member1 = *(const MultiXactMember *) arg1;
1448 MultiXactMember member2 = *(const MultiXactMember *) arg2;
1450 if (member1.xid > member2.xid)
1452 if (member1.xid < member2.xid)
1454 if (member1.status > member2.status)
1456 if (member1.status < member2.status)
1462 * mXactCacheGetBySet
1463 * returns a MultiXactId from the cache based on the set of
1464 * TransactionIds that compose it, or InvalidMultiXactId if
1467 * This is helpful, for example, if two transactions want to lock a huge
1468 * table. By using the cache, the second will use the same MultiXactId
1469 * for the majority of tuples, thus keeping MultiXactId usage low (saving
1470 * both I/O and wraparound issues).
1472 * NB: the passed members array will be sorted in-place.
1475 mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1479 debug_elog3(DEBUG2, "CacheGet: looking for %s",
1480 mxid_to_string(InvalidMultiXactId, nmembers, members));
1482 /* sort the array so comparison is easy */
1483 qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1485 dlist_foreach(iter, &MXactCache)
1487 mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
1489 if (entry->nmembers != nmembers)
1493 * We assume the cache entries are sorted, and that the unused bits in
1494 * "status" are zeroed.
1496 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1498 debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1499 dlist_move_head(&MXactCache, iter.cur);
1500 return entry->multi;
1504 debug_elog2(DEBUG2, "CacheGet: not found :-(");
1505 return InvalidMultiXactId;
1510 * returns the composing MultiXactMember set from the cache for a
1511 * given MultiXactId, if present.
1513 * If successful, *xids is set to the address of a palloc'd copy of the
1514 * MultiXactMember set. Return value is number of members, or -1 on failure.
1517 mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1521 debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1523 dlist_foreach(iter, &MXactCache)
1525 mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
1527 if (entry->multi == multi)
1529 MultiXactMember *ptr;
1532 size = sizeof(MultiXactMember) * entry->nmembers;
1533 ptr = (MultiXactMember *) palloc(size);
1536 memcpy(ptr, entry->members, size);
1538 debug_elog3(DEBUG2, "CacheGet: found %s",
1539 mxid_to_string(multi,
1544 * Note we modify the list while not using a modifiable iterator.
1545 * This is acceptable only because we exit the iteration
1546 * immediately afterwards.
1548 dlist_move_head(&MXactCache, iter.cur);
1550 return entry->nmembers;
1554 debug_elog2(DEBUG2, "CacheGet: not found");
1560 * Add a new MultiXactId and its composing set into the local cache.
1563 mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1565 mXactCacheEnt *entry;
1567 debug_elog3(DEBUG2, "CachePut: storing %s",
1568 mxid_to_string(multi, nmembers, members));
1570 if (MXactContext == NULL)
1572 /* The cache only lives as long as the current transaction */
1573 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1574 MXactContext = AllocSetContextCreate(TopTransactionContext,
1575 "MultiXact cache context",
1576 ALLOCSET_SMALL_SIZES);
1579 entry = (mXactCacheEnt *)
1580 MemoryContextAlloc(MXactContext,
1581 offsetof(mXactCacheEnt, members) +
1582 nmembers * sizeof(MultiXactMember));
1584 entry->multi = multi;
1585 entry->nmembers = nmembers;
1586 memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1588 /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1589 qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1591 dlist_push_head(&MXactCache, &entry->node);
1592 if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES)
1595 mXactCacheEnt *entry;
1597 node = dlist_tail_node(&MXactCache);
1599 MXactCacheMembers--;
1601 entry = dlist_container(mXactCacheEnt, node, node);
1602 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1610 mxstatus_to_string(MultiXactStatus status)
1614 case MultiXactStatusForKeyShare:
1616 case MultiXactStatusForShare:
1618 case MultiXactStatusForNoKeyUpdate:
1619 return "fornokeyupd";
1620 case MultiXactStatusForUpdate:
1622 case MultiXactStatusNoKeyUpdate:
1624 case MultiXactStatusUpdate:
1627 elog(ERROR, "unrecognized multixact status %d", status);
1633 mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1635 static char *str = NULL;
1642 initStringInfo(&buf);
1644 appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1645 mxstatus_to_string(members[0].status));
1647 for (i = 1; i < nmembers; i++)
1648 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1649 mxstatus_to_string(members[i].status));
1651 appendStringInfoChar(&buf, ']');
1652 str = MemoryContextStrdup(TopMemoryContext, buf.data);
1658 * AtEOXact_MultiXact
1659 * Handle transaction end for MultiXact
1661 * This is called at top transaction commit or abort (we don't care which).
1664 AtEOXact_MultiXact(void)
1667 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1668 * which should only be valid while within a transaction.
1670 * We assume that storing a MultiXactId is atomic and so we need not take
1671 * MultiXactGenLock to do this.
1673 OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
1674 OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
1677 * Discard the local MultiXactId cache. Since MXactContext was created as
1678 * a child of TopTransactionContext, we needn't delete it explicitly.
1680 MXactContext = NULL;
1681 dlist_init(&MXactCache);
1682 MXactCacheMembers = 0;
1686 * AtPrepare_MultiXact
1687 * Save multixact state at 2PC transaction prepare
1689 * In this phase, we only store our OldestMemberMXactId value in the two-phase
1693 AtPrepare_MultiXact(void)
1695 MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId];
1697 if (MultiXactIdIsValid(myOldestMember))
1698 RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1699 &myOldestMember, sizeof(MultiXactId));
1703 * PostPrepare_MultiXact
1704 * Clean up after successful PREPARE TRANSACTION
1707 PostPrepare_MultiXact(TransactionId xid)
1709 MultiXactId myOldestMember;
1712 * Transfer our OldestMemberMXactId value to the slot reserved for the
1713 * prepared transaction.
1715 myOldestMember = OldestMemberMXactId[MyBackendId];
1716 if (MultiXactIdIsValid(myOldestMember))
1718 BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
1721 * Even though storing MultiXactId is atomic, acquire lock to make
1722 * sure others see both changes, not just the reset of the slot of the
1723 * current backend. Using a volatile pointer might suffice, but this
1726 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1728 OldestMemberMXactId[dummyBackendId] = myOldestMember;
1729 OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
1731 LWLockRelease(MultiXactGenLock);
1735 * We don't need to transfer OldestVisibleMXactId value, because the
1736 * transaction is not going to be looking at any more multixacts once it's
1739 * We assume that storing a MultiXactId is atomic and so we need not take
1740 * MultiXactGenLock to do this.
1742 OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
1745 * Discard the local MultiXactId cache like in AtEOX_MultiXact
1747 MXactContext = NULL;
1748 dlist_init(&MXactCache);
1749 MXactCacheMembers = 0;
1753 * multixact_twophase_recover
1754 * Recover the state of a prepared transaction at startup
1757 multixact_twophase_recover(TransactionId xid, uint16 info,
1758 void *recdata, uint32 len)
1760 BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
1761 MultiXactId oldestMember;
1764 * Get the oldest member XID from the state file record, and set it in the
1765 * OldestMemberMXactId slot reserved for this prepared transaction.
1767 Assert(len == sizeof(MultiXactId));
1768 oldestMember = *((MultiXactId *) recdata);
1770 OldestMemberMXactId[dummyBackendId] = oldestMember;
1774 * multixact_twophase_postcommit
1775 * Similar to AtEOX_MultiXact but for COMMIT PREPARED
1778 multixact_twophase_postcommit(TransactionId xid, uint16 info,
1779 void *recdata, uint32 len)
1781 BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true);
1783 Assert(len == sizeof(MultiXactId));
1785 OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId;
1789 * multixact_twophase_postabort
1790 * This is actually just the same as the COMMIT case.
1793 multixact_twophase_postabort(TransactionId xid, uint16 info,
1794 void *recdata, uint32 len)
1796 multixact_twophase_postcommit(xid, info, recdata, len);
1800 * Initialization of shared memory for MultiXact. We use two SLRU areas,
1801 * thus double memory. Also, reserve space for the shared MultiXactState
1802 * struct and the per-backend MultiXactId arrays (two of those, too).
1805 MultiXactShmemSize(void)
1809 /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */
1810 #define SHARED_MULTIXACT_STATE_SIZE \
1811 add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \
1812 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1814 size = SHARED_MULTIXACT_STATE_SIZE;
1815 size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0));
1816 size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0));
1822 MultiXactShmemInit(void)
1826 debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1828 MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
1829 MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
1831 SimpleLruInit(MultiXactOffsetCtl,
1832 "multixact_offset", NUM_MXACTOFFSET_BUFFERS, 0,
1833 MultiXactOffsetControlLock, "pg_multixact/offsets",
1834 LWTRANCHE_MXACTOFFSET_BUFFERS);
1835 SimpleLruInit(MultiXactMemberCtl,
1836 "multixact_member", NUM_MXACTMEMBER_BUFFERS, 0,
1837 MultiXactMemberControlLock, "pg_multixact/members",
1838 LWTRANCHE_MXACTMEMBER_BUFFERS);
1840 /* Initialize our shared state struct */
1841 MultiXactState = ShmemInitStruct("Shared MultiXact State",
1842 SHARED_MULTIXACT_STATE_SIZE,
1844 if (!IsUnderPostmaster)
1848 /* Make sure we zero out the per-backend state */
1849 MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1855 * Set up array pointers. Note that perBackendXactIds[0] is wasted space
1856 * since we only use indexes 1..MaxOldestSlot in each array.
1858 OldestMemberMXactId = MultiXactState->perBackendXactIds;
1859 OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
1863 * This func must be called ONCE on system install. It creates the initial
1864 * MultiXact segments. (The MultiXacts directories are assumed to have been
1865 * created by initdb, and MultiXactShmemInit must have been called already.)
1868 BootStrapMultiXact(void)
1872 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1874 /* Create and zero the first page of the offsets log */
1875 slotno = ZeroMultiXactOffsetPage(0, false);
1877 /* Make sure it's written out */
1878 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
1879 Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
1881 LWLockRelease(MultiXactOffsetControlLock);
1883 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
1885 /* Create and zero the first page of the members log */
1886 slotno = ZeroMultiXactMemberPage(0, false);
1888 /* Make sure it's written out */
1889 SimpleLruWritePage(MultiXactMemberCtl, slotno);
1890 Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
1892 LWLockRelease(MultiXactMemberControlLock);
1896 * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
1897 * If writeXlog is true, also emit an XLOG record saying we did this.
1899 * The page is not actually written, just set up in shared memory.
1900 * The slot number of the new page is returned.
1902 * Control lock must be held at entry, and will be held at exit.
1905 ZeroMultiXactOffsetPage(int pageno, bool writeXlog)
1909 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
1912 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
1918 * Ditto, for MultiXactMember
1921 ZeroMultiXactMemberPage(int pageno, bool writeXlog)
1925 slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
1928 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
1934 * MaybeExtendOffsetSlru
1935 * Extend the offsets SLRU area, if necessary
1937 * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
1938 * contain files that are shorter than necessary; this would occur if the old
1939 * installation had used multixacts beyond the first page (files cannot be
1940 * copied, because the on-disk representation is different). pg_upgrade would
1941 * update pg_control to set the next offset value to be at that position, so
1942 * that tuples marked as locked by such MultiXacts would be seen as visible
1943 * without having to consult multixact. However, trying to create and use a
1944 * new MultiXactId would result in an error because the page on which the new
1945 * value would reside does not exist. This routine is in charge of creating
1949 MaybeExtendOffsetSlru(void)
/* Offsets page on which the next MultiXactId to be assigned would live. */
1953 pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
/* The control lock is held across the existence check and the page creation. */
1955 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
/* Only act when the page is reported as not physically existing. */
1957 if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
1962 * Fortunately for us, SimpleLruWritePage is already prepared to deal
1963 * with creating a new segment file even if the page we're writing is
1964 * not the first in it, so this is enough.
/* Zero the page with writeXlog = false (no WAL record), then write it out right away. */
1966 slotno = ZeroMultiXactOffsetPage(pageno, false);
1967 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
1970 LWLockRelease(MultiXactOffsetControlLock);
1974 * This must be called ONCE during postmaster or standalone-backend startup.
1976 * StartupXLOG has already established nextMXact/nextOffset by calling
1977 * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
1978 * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
1982 StartupMultiXact(void)
/* Starting values; per the header comment, StartupXLOG has already established them. */
1984 MultiXactId multi = MultiXactState->nextMXact;
1985 MultiXactOffset offset = MultiXactState->nextOffset;
1989 * Initialize offset's idea of the latest page number.
/* The offsets page is derived from the next MultiXactId. */
1991 pageno = MultiXactIdToOffsetPage(multi);
1992 MultiXactOffsetCtl->shared->latest_page_number = pageno;
1995 * Initialize member's idea of the latest page number.
/* The members page is derived from the next member offset; pageno is reused for it. */
1997 pageno = MXOffsetToMemberPage(offset);
1998 MultiXactMemberCtl->shared->latest_page_number = pageno;
2002 * This must be called ONCE at the end of startup/recovery.
/*
 * NOTE(review): the signature line of this function is not visible in this
 * listing.  The body sets MultiXactState->finishedStartup, which
 * SetMultiXactIdLimit's comments associate with TrimMultiXact() -- confirm
 * the name against the full file.
 */
2007 MultiXactId nextMXact;
2008 MultiXactOffset offset;
2009 MultiXactId oldestMXact;
/* Copy the shared counters under a shared lock; everything below works from these local copies. */
2015 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2016 nextMXact = MultiXactState->nextMXact;
2017 offset = MultiXactState->nextOffset;
2018 oldestMXact = MultiXactState->oldestMultiXactId;
2019 oldestMXactDB = MultiXactState->oldestMultiXactDB;
2020 LWLockRelease(MultiXactGenLock);
2022 /* Clean up offsets state */
2023 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
2026 * (Re-)Initialize our idea of the latest page number for offsets.
2028 pageno = MultiXactIdToOffsetPage(nextMXact);
2029 MultiXactOffsetCtl->shared->latest_page_number = pageno;
2032 * Zero out the remainder of the current offsets page. See notes in
2033 * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2034 * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2035 * rule "write xlog before data," nextMXact successors may carry obsolete,
2036 * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2037 * operates normally.
/* Position of nextMXact within its offsets page. */
2039 entryno = MultiXactIdToOffsetEntry(nextMXact);
2043 MultiXactOffset *offptr;
2045 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2046 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
/*
 * The length is BLCKSZ minus the bytes taken by the first entryno entries,
 * i.e. the tail of the page starting at entry entryno.
 * NOTE(review): that length is only right if offptr has been advanced by
 * entryno first; no such statement is visible in this listing -- confirm.
 */
2049 MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
/* Flag the buffer as modified. */
2051 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2054 LWLockRelease(MultiXactOffsetControlLock);
2056 /* And the same for members */
2057 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
2060 * (Re-)Initialize our idea of the latest page number for members.
2062 pageno = MXOffsetToMemberPage(offset);
2063 MultiXactMemberCtl->shared->latest_page_number = pageno;
2066 * Zero out the remainder of the current members page. See notes in
2067 * TrimCLOG() for motivation.
2069 flagsoff = MXOffsetToFlagsOffset(offset);
2073 TransactionId *xidptr;
2076 memberoff = MXOffsetToMemberOffset(offset);
2077 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2078 xidptr = (TransactionId *)
2079 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
/* xidptr already points memberoff bytes into the page, so this clears from there through the end of the page. */
2081 MemSet(xidptr, 0, BLCKSZ - memberoff);
2084 * Note: we don't need to zero out the flag bits in the remaining
2085 * members of the current group, because they are always reset before
2089 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2092 LWLockRelease(MultiXactMemberControlLock);
2094 /* signal that we're officially up */
2095 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2096 MultiXactState->finishedStartup = true;
2097 LWLockRelease(MultiXactGenLock);
2099 /* Now compute how far away the next members wraparound is. */
/* Uses the oldest-multi values captured at the top of the function, with is_startup = true. */
2100 SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2104 * This must be called ONCE during postmaster or standalone-backend shutdown
2107 ShutdownMultiXact(void)
2109 /* Flush dirty MultiXact pages to disk */
/*
 * Offsets are flushed first, then members, bracketed by the checkpoint
 * trace probes.  Every call here is passed false; CheckPointMultiXact
 * makes the same calls with true.
 */
2110 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
2111 SimpleLruFlush(MultiXactOffsetCtl, false);
2112 SimpleLruFlush(MultiXactMemberCtl, false);
2113 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
2117 * Get the MultiXact data to save in a checkpoint record
/*
 * All pointer parameters are outputs, filled from MultiXactState below.
 * is_shutdown is not referenced by any visible line of the body.
 * NOTE(review): the body also writes through oldestMultiDB, whose
 * parameter declaration is not visible in this listing.
 */
2120 MultiXactGetCheckptMulti(bool is_shutdown,
2121 MultiXactId *nextMulti,
2122 MultiXactOffset *nextMultiOffset,
2123 MultiXactId *oldestMulti,
/* A shared lock is enough: the four fields are only read, and are read together. */
2126 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2127 *nextMulti = MultiXactState->nextMXact;
2128 *nextMultiOffset = MultiXactState->nextOffset;
2129 *oldestMulti = MultiXactState->oldestMultiXactId;
2130 *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2131 LWLockRelease(MultiXactGenLock);
/*
 * Report the values just returned.
 * NOTE(review): the logging call these two argument lines belong to is not
 * visible in this listing.
 */
2134 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2135 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2139 * Perform a checkpoint --- either during shutdown, or on-the-fly
2142 CheckPointMultiXact(void)
/* The START and DONE trace probes bracket the two flushes; all four calls are passed true. */
2144 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2146 /* Flush dirty MultiXact pages to disk */
/* Offsets are flushed before members. */
2147 SimpleLruFlush(MultiXactOffsetCtl, true);
2148 SimpleLruFlush(MultiXactMemberCtl, true);
2150 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2154 * Set the next-to-be-assigned MultiXactId and offset
2156 * This is used when we can determine the correct next ID/offset exactly
2157 * from a checkpoint record. Although this is only called during bootstrap
2158 * and XLog replay, we take the lock in case any hot-standby backends are
2159 * examining the values.
/* Overwrites both counters unconditionally with the supplied values. */
2162 MultiXactSetNextMXact(MultiXactId nextMulti,
2163 MultiXactOffset nextMultiOffset)
2165 debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2166 nextMulti, nextMultiOffset);
/* Both fields are installed inside one exclusive-lock section. */
2167 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2168 MultiXactState->nextMXact = nextMulti;
2169 MultiXactState->nextOffset = nextMultiOffset;
2170 LWLockRelease(MultiXactGenLock);
2173 * During a binary upgrade, make sure that the offsets SLRU is large
2174 * enough to contain the next value that would be created.
2176 * We need to do this pretty early during the first startup in binary
2177 * upgrade mode: before StartupMultiXact() in fact, because this routine
2178 * is called even before that by StartupXLOG(). And we can't do it
2179 * earlier than at this point, because during that first call of this
2180 * routine we determine the MultiXactState->nextMXact value that
2181 * MaybeExtendOffsetSlru needs.
/* Runs after the assignment above because MaybeExtendOffsetSlru reads MultiXactState->nextMXact. */
2183 if (IsBinaryUpgrade)
2184 MaybeExtendOffsetSlru();
2188 * Determine the last safe MultiXactId to allocate given the currently oldest
2189 * datminmxid (ie, the oldest MultiXactId that might exist in any database
2190 * of our cluster), and the OID of the (or a) database with that value.
2192 * is_startup is true when we are just starting the cluster, false when we
2193 * are updating state in a running cluster. This only affects log messages.
/*
 * oldest_datminmxid is asserted valid below; oldest_datoid is stored with
 * it and used in the messages.
 * NOTE(review): the is_startup parameter described in the header comment
 * is declared on a line not visible in this listing.
 */
2196 SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2199 MultiXactId multiVacLimit;
2200 MultiXactId multiWarnLimit;
2201 MultiXactId multiStopLimit;
2202 MultiXactId multiWrapLimit;
2203 MultiXactId curMulti;
2204 bool needs_offset_vacuum;
2206 Assert(MultiXactIdIsValid(oldest_datminmxid));
2209 * We pretend that a wrap will happen halfway through the multixact ID
2210 * space, but that's not really true, because multixacts wrap differently
2211 * from transaction IDs. Note that, separately from any concern about
2212 * multixact IDs wrapping, we must ensure that multixact members do not
2213 * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
/*
 * Wrap limit: half of MaxMultiXactId past the oldest value.  A result that
 * comes out below FirstMultiXactId is moved up by FirstMultiXactId.
 */
2215 multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2216 if (multiWrapLimit < FirstMultiXactId)
2217 multiWrapLimit += FirstMultiXactId;
2220 * We'll refuse to continue assigning MultiXactIds once we get within 100
2221 * multi of data loss.
2223 * Note: This differs from the magic number used in
2224 * SetTransactionIdLimit() since vacuum itself will never generate new
2225 * multis. XXX actually it does, if it needs to freeze old multis.
/* Stop limit: 100 IDs before the wrap limit; here the adjustment subtracts, since we are moving backwards. */
2227 multiStopLimit = multiWrapLimit - 100;
2228 if (multiStopLimit < FirstMultiXactId)
2229 multiStopLimit -= FirstMultiXactId;
2232 * We'll start complaining loudly when we get within 10M multis of the
2233 * stop point. This is kind of arbitrary, but if you let your gas gauge
2234 * get down to 1% of full, would you be looking for the next gas station?
2235 * We need to be fairly liberal about this number because there are lots
2236 * of scenarios where most transactions are done by automatic clients that
2237 * won't pay attention to warnings. (No, we're not gonna make this
2238 * configurable. If you know enough to configure it, you know enough to
2239 * not get in this kind of trouble in the first place.)
/* Warn limit: 10,000,000 IDs before the stop limit, with the same backwards adjustment. */
2241 multiWarnLimit = multiStopLimit - 10000000;
2242 if (multiWarnLimit < FirstMultiXactId)
2243 multiWarnLimit -= FirstMultiXactId;
2246 * We'll start trying to force autovacuums when oldest_datminmxid gets to
2247 * be more than autovacuum_multixact_freeze_max_age mxids old.
2249 * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2250 * so that we don't have to worry about dealing with on-the-fly changes in
2251 * its value. See SetTransactionIdLimit.
/* Vacuum limit: measured forward from the oldest value, adjusted upward like the wrap limit. */
2253 multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2254 if (multiVacLimit < FirstMultiXactId)
2255 multiVacLimit += FirstMultiXactId;
2257 /* Grab lock for just long enough to set the new limit values */
2258 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2259 MultiXactState->oldestMultiXactId = oldest_datminmxid;
2260 MultiXactState->oldestMultiXactDB = oldest_datoid;
2261 MultiXactState->multiVacLimit = multiVacLimit;
2262 MultiXactState->multiWarnLimit = multiWarnLimit;
2263 MultiXactState->multiStopLimit = multiStopLimit;
2264 MultiXactState->multiWrapLimit = multiWrapLimit;
/* Sample nextMXact while the lock is still held; it feeds the comparisons further down. */
2265 curMulti = MultiXactState->nextMXact;
2266 LWLockRelease(MultiXactGenLock);
/* NOTE(review): the logging call and level that this message belongs to are not visible in this listing. */
2270 (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u",
2271 multiWrapLimit, oldest_datoid)));
2274 * Computing the actual limits is only possible once the data directory is
2275 * in a consistent state. There's no need to compute the limits while
2276 * still replaying WAL - no decisions about new multis are made even
2277 * though multixact creations might be replayed. So we'll only do further
2278 * checks after TrimMultiXact() has been called.
/* NOTE(review): the statement executed when this test is true (presumably an early return) is not visible in this listing. */
2280 if (!MultiXactState->finishedStartup)
2283 Assert(!InRecovery);
2285 /* Set limits for offset vacuum. */
2286 needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2289 * If past the autovacuum force point, immediately signal an autovac
2290 * request. The reason for this is that autovac only processes one
2291 * database per invocation. Once it's finished cleaning up the oldest
2292 * database, it'll call here, and we'll signal the postmaster to start
2293 * another iteration immediately if there are still any old databases.
/* Either trigger is sufficient, but the signal is only sent when running under the postmaster. */
2295 if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2296 needs_offset_vacuum) && IsUnderPostmaster)
2297 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2299 /* Give an immediate warning if past the wrap warn point */
2300 if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2302 char *oldest_datname;
2305 * We can be called when not inside a transaction, for example during
2306 * StartupXLOG(). In such a case we cannot do database access, so we
2307 * must just report the oldest DB's OID.
2309 * Note: it's also possible that get_database_name fails and returns
2310 * NULL, for example because the database just got dropped. We'll
2311 * still warn, even though the warning might now be unnecessary.
2313 if (IsTransactionState())
2314 oldest_datname = get_database_name(oldest_datoid);
2316 oldest_datname = NULL;
/*
 * Wording used when a database name is available; the count reported is
 * multiWrapLimit - curMulti.
 * NOTE(review): the test on oldest_datname, the report level and the name
 * argument are not visible in this listing.
 */
2320 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2321 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2322 multiWrapLimit - curMulti,
2324 multiWrapLimit - curMulti),
2325 errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
2326 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
/* Fallback wording that identifies the database by OID instead of by name; same count and hint. */
2329 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2330 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2331 multiWrapLimit - curMulti,
2333 multiWrapLimit - curMulti),
2334 errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
2335 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2340 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2341 * and similarly nextOffset is at least minMultiOffset.
2343 * This is used when we can determine minimum safe values from an XLog
2344 * record (either an on-line checkpoint or an mxact creation log entry).
2345 * Although this is only called during XLog replay, we take the lock in case
2346 * any hot-standby backends are examining the values.
/* Each counter is tested and raised independently; neither is ever moved backwards here. */
2349 MultiXactAdvanceNextMXact(MultiXactId minMulti,
2350 MultiXactOffset minMultiOffset)
2352 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
/* Raise nextMXact only when the stored value precedes minMulti. */
2353 if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2355 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2356 MultiXactState->nextMXact = minMulti;
/* Same rule for the member offset, using the offset-specific comparison. */
2358 if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2360 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2362 MultiXactState->nextOffset = minMultiOffset;
2364 LWLockRelease(MultiXactGenLock);
2368 * Update our oldestMultiXactId value, but only if it's more recent than what
2371 * This may only be called during WAL replay.
2374 MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
/*
 * Hand over to SetMultiXactIdLimit (with is_startup = false) only when the
 * stored oldest value precedes the new one; otherwise nothing is changed.
 */
2378 if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2379 SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2383 * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2385 * NB: this is called while holding MultiXactGenLock. We want it to be very
2386 * fast most of the time; even when it's not so fast, no actual I/O need
2387 * happen unless we're forced to write out a dirty log or xlog page to make
2388 * room in shared memory.
2391 ExtendMultiXactOffset(MultiXactId multi)
2396 * No work except at first MultiXactId of a page. But beware: just after
2397 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
/*
 * True when multi is neither entry 0 of its page nor FirstMultiXactId.
 * NOTE(review): the statement taken in that case (presumably an early
 * return) is not visible in this listing.
 */
2399 if (MultiXactIdToOffsetEntry(multi) != 0 &&
2400 multi != FirstMultiXactId)
2403 pageno = MultiXactIdToOffsetPage(multi);
2405 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
2407 /* Zero the page and make an XLOG entry about it */
2408 ZeroMultiXactOffsetPage(pageno, true);
2410 LWLockRelease(MultiXactOffsetControlLock);
2414 * Make sure that MultiXactMember has room for the members of a newly-
2415 * allocated MultiXactId.
2417 * Like the above routine, this is called while holding MultiXactGenLock;
2418 * same comments apply.
/* offset is the first member slot to cover and nmembers the number of slots; both are consumed as the loop advances. */
2421 ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2424 * It's possible that the members span more than one page of the members
2425 * file, so we loop to ensure we consider each page. The coding is not
2426 * optimal if the members span several pages, but that seems unusual
2427 * enough to not worry much about.
2429 while (nmembers > 0)
2436 * Only zero when at first entry of a page.
2438 flagsoff = MXOffsetToFlagsOffset(offset);
2439 flagsbit = MXOffsetToFlagsBitShift(offset);
/* Both the flags offset and the flags bit shift must be zero for the page to be (re)initialized. */
2440 if (flagsoff == 0 && flagsbit == 0)
2444 pageno = MXOffsetToMemberPage(offset);
2446 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
2448 /* Zero the page and make an XLOG entry about it */
2449 ZeroMultiXactMemberPage(pageno, true);
2451 LWLockRelease(MultiXactMemberControlLock);
2455 * Compute the number of items till end of current page. Careful: if
2456 * addition of unsigned ints wraps around, we're at the last page of
2457 * the last segment; since that page holds a different number of items
2458 * than other pages, we need to do it differently.
/* The sum can only be smaller than offset if the unsigned addition overflowed. */
2460 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2463 * This is the last page of the last segment; we can compute the
2464 * number of items left to allocate in it without modulo
2467 difference = MaxMultiXactOffset - offset + 1;
/*
 * Ordinary page: slots left before the next page boundary.
 * NOTE(review): the else pairing this with the branch above is not visible
 * in this listing.
 */
2470 difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2473 * Advance to next page, taking care to properly handle the wraparound
2474 * case. OK if nmembers goes negative.
2476 nmembers -= difference;
2477 offset += difference;
2482 * GetOldestMultiXactId
2484 * Return the oldest MultiXactId that's possibly still seen as live by
2485 * any running transaction. Older ones might still exist on disk, but they no
2486 * longer have any running member transaction.
2488 * It's not safe to truncate MultiXact SLRU segments on the value returned by
2489 * this function; however, it can be used by a full-table vacuum to set the
2490 * point at which it will be possible to truncate SLRU for that table.
2493 GetOldestMultiXactId(void)
2495 MultiXactId oldestMXact;
2496 MultiXactId nextMXact;
2500 * This is the oldest valid value among all the OldestMemberMXactId[] and
2501 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
/* The shared lock is held across the whole scan. */
2503 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2506 * We have to beware of the possibility that nextMXact is in the
2507 * wrapped-around state. We don't fix the counter itself here, but we
2508 * must be sure to use a valid value in our calculation.
2510 nextMXact = MultiXactState->nextMXact;
2511 if (nextMXact < FirstMultiXactId)
2512 nextMXact = FirstMultiXactId;
/* Start from nextMXact and lower the answer whenever a valid entry precedes it. */
2514 oldestMXact = nextMXact;
/* Slots are scanned from index 1 through MaxOldestSlot inclusive; both arrays share the index. */
2515 for (i = 1; i <= MaxOldestSlot; i++)
2517 MultiXactId thisoldest;
2519 thisoldest = OldestMemberMXactId[i];
2520 if (MultiXactIdIsValid(thisoldest) &&
2521 MultiXactIdPrecedes(thisoldest, oldestMXact))
2522 oldestMXact = thisoldest;
2523 thisoldest = OldestVisibleMXactId[i];
2524 if (MultiXactIdIsValid(thisoldest) &&
2525 MultiXactIdPrecedes(thisoldest, oldestMXact))
2526 oldestMXact = thisoldest;
2529 LWLockRelease(MultiXactGenLock);
2535 * Determine how aggressively we need to vacuum in order to prevent member
2538 * To do so determine what's the oldest member offset and install the limit
2539 * info in MultiXactState, where it can be used to prevent overrun of old data
2540 * in the members SLRU area.
2542 * The return value is true if emergency autovacuum is required and false
/* is_startup is only consulted below to decide whether the "protections are now enabled" message is emitted. */
2546 SetOffsetVacuumLimit(bool is_startup)
2548 MultiXactId oldestMultiXactId;
2549 MultiXactId nextMXact;
2550 MultiXactOffset oldestOffset = 0; /* placate compiler */
2551 MultiXactOffset prevOldestOffset;
2552 MultiXactOffset nextOffset;
2553 bool oldestOffsetKnown = false;
2554 bool prevOldestOffsetKnown;
2555 MultiXactOffset offsetStopLimit = 0;
2556 MultiXactOffset prevOffsetStopLimit;
2559 * NB: Have to prevent concurrent truncation, we might otherwise try to
2560 * lookup an oldestMulti that's concurrently getting truncated away.
2562 LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2564 /* Read relevant fields from shared memory. */
/* The prev* copies preserve the currently installed values so they can be reused if the lookup below fails. */
2565 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2566 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2567 nextMXact = MultiXactState->nextMXact;
2568 nextOffset = MultiXactState->nextOffset;
2569 prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2570 prevOldestOffset = MultiXactState->oldestOffset;
2571 prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2572 Assert(MultiXactState->finishedStartup);
2573 LWLockRelease(MultiXactGenLock);
2576 * Determine the offset of the oldest multixact. Normally, we can read
2577 * the offset from the multixact itself, but there's an important special
2578 * case: if there are no multixacts in existence at all, oldestMXact
2579 * obviously can't point to one. It will instead point to the multixact
2580 * ID that will be assigned the next time one is needed.
2582 if (oldestMultiXactId == nextMXact)
2585 * When the next multixact gets created, it will be stored at the next
2588 oldestOffset = nextOffset;
2589 oldestOffsetKnown = true;
2594 * Figure out where the oldest existing multixact's offsets are
2595 * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2596 * the supposedly-earliest multixact might not really exist. We are
2597 * careful not to fail in that case.
/*
 * NOTE(review): the line that stores find_multixact_start's result
 * (presumably into oldestOffsetKnown) is not visible in this listing.
 */
2600 find_multixact_start(oldestMultiXactId, &oldestOffset);
2602 if (oldestOffsetKnown)
2604 (errmsg("oldest MultiXactId member is at offset %u",
2608 (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2609 oldestMultiXactId)));
/* The truncation lock is no longer needed once the offset lookup is done. */
2612 LWLockRelease(MultiXactTruncationLock);
2615 * If we can, compute limits (and install them MultiXactState) to prevent
2616 * overrun of old data in the members SLRU area. We can only do so if the
2617 * oldest offset is known though.
2619 if (oldestOffsetKnown)
2621 /* move back to start of the corresponding segment */
/* i.e. round oldestOffset down to a multiple of MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT. */
2622 offsetStopLimit = oldestOffset - (oldestOffset %
2623 (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
2625 /* always leave one segment before the wraparound point */
2626 offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
/* Announce the transition from "unknown" to "known", except during startup. */
2628 if (!prevOldestOffsetKnown && !is_startup)
2630 (errmsg("MultiXact member wraparound protections are now enabled")));
2633 (errmsg("MultiXact member stop limit is now %u based on MultiXact %u",
2634 offsetStopLimit, oldestMultiXactId)));
2636 else if (prevOldestOffsetKnown)
2639 * If we failed to get the oldest offset this time, but we have a
2640 * value from a previous pass through this function, use the old
2641 * values rather than automatically forcing an emergency autovacuum
2644 oldestOffset = prevOldestOffset;
2645 oldestOffsetKnown = true;
2646 offsetStopLimit = prevOffsetStopLimit;
2649 /* Install the computed values */
2650 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2651 MultiXactState->oldestOffset = oldestOffset;
2652 MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2653 MultiXactState->offsetStopLimit = offsetStopLimit;
2654 LWLockRelease(MultiXactGenLock);
2657 * Do we need an emergency autovacuum? If we're not sure, assume yes.
/* nextOffset - oldestOffset is the member space in use, using the nextOffset value sampled at the top of the function. */
2659 return !oldestOffsetKnown ||
2660 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2664 * Return whether adding "distance" to "start" would move past "boundary".
2666 * We use this to determine whether the addition is "wrapping around" the
2667 * boundary point, hence the name. The reason we don't want to use the regular
2668 * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2669 * the 2^32-1 space here, allowing for more multixacts than would fit
/*
 * boundary is the offset that must not be crossed, start the current
 * position, and distance the amount to add to start.
 * NOTE(review): the declaration of distance is on a line not visible in
 * this listing.
 */
2673 MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
2676 MultiXactOffset finish;
2679 * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2680 * if the addition wraps around the UINT_MAX boundary, skip that value.
/*
 * finish is the raw sum and may wrap around.
 * NOTE(review): the adjustment that skips offset 0 after such a wrap,
 * described just above, is not visible in this listing -- confirm.
 */
2682 finish = start + distance;
2686 /*-----------------------------------------------------------------------
2687 * When the boundary is numerically greater than the starting point, any
2688 * value numerically between the two is not wrapped:
2691 * [---) = F wrapped past B (and UINT_MAX)
2692 * [---) = F not wrapped
2693 * [----] = F wrapped past B
2695 * When the boundary is numerically less than the starting point (i.e. the
2696 * UINT_MAX wraparound occurs somewhere in between) then all values in
2697 * between are wrapped:
2700 * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2701 * [---) = F wrapped past B (and UINT_MAX)
2702 * [----] = F not wrapped
2703 *-----------------------------------------------------------------------
2705 if (start < boundary)
2706 return finish >= boundary || finish < start;
2708 return finish >= boundary && finish < start;
2712 * Find the starting offset of the given MultiXactId.
2714 * Returns false if the file containing the multi does not exist on disk.
2715 * Otherwise, returns true and sets *result to the starting member offset.
2717 * This function does not prevent concurrent truncation, so if that's
2718 * required, the caller has to protect against that.
/* multi is the MultiXactId to look up; result is the output location described in the header comment. */
2721 find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2723 MultiXactOffset offset;
2727 MultiXactOffset *offptr;
2729 Assert(MultiXactState->finishedStartup);
/* Locate multi's offsets page and its entry within that page. */
2731 pageno = MultiXactIdToOffsetPage(multi);
2732 entryno = MultiXactIdToOffsetEntry(multi);
2735 * Flush out dirty data, so PhysicalPageExists can work correctly.
2736 * SimpleLruFlush() is a pretty big hammer for that. Alternatively we
2737 * could add an in-memory version of page exists, but find_multixact_start
2738 * is called infrequently, and it doesn't seem bad to flush buffers to
2739 * disk before truncation.
2741 SimpleLruFlush(MultiXactOffsetCtl, true);
2742 SimpleLruFlush(MultiXactMemberCtl, true);
/*
 * Per the header comment, a missing file makes the function return false.
 * NOTE(review): that return statement is not visible in this listing.
 */
2744 if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2747 /* lock is acquired by SimpleLruReadPage_ReadOnly */
2748 slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2749 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
/*
 * Drop the lock that the read-only page read left held.
 * NOTE(review): the read of the entry through offptr and the store into
 * *result are not visible in this listing.
 */
2752 LWLockRelease(MultiXactOffsetControlLock);
2759 * Determine how many multixacts, and how many multixact members, currently
2760 * exist. Return false if unable to determine.
/* multixacts and members are output parameters; they are written only on the path that reaches the bottom of the function. */
2763 ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
2765 MultiXactOffset nextOffset;
2766 MultiXactOffset oldestOffset;
2767 MultiXactId oldestMultiXactId;
2768 MultiXactId nextMultiXactId;
2769 bool oldestOffsetKnown;
/* Read all five fields inside one shared-lock section. */
2771 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2772 nextOffset = MultiXactState->nextOffset;
2773 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2774 nextMultiXactId = MultiXactState->nextMXact;
2775 oldestOffset = MultiXactState->oldestOffset;
2776 oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2777 LWLockRelease(MultiXactGenLock);
/*
 * Without a known oldest offset the member count cannot be computed.
 * NOTE(review): the return taken here is not visible in this listing; the
 * header comment says false is returned when the counts are undeterminable.
 */
2779 if (!oldestOffsetKnown)
/* Counts are plain differences between the "next" and "oldest" values. */
2782 *members = nextOffset - oldestOffset;
2783 *multixacts = nextMultiXactId - oldestMultiXactId;
2788 * Multixact members can be removed once the multixacts that refer to them
2789 * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2790 * vacuum_multixact_freeze_table_age work together to make sure we never have
2791 * too many multixacts; we hope that, at least under normal circumstances,
2792 * this will also be sufficient to keep us from using too many offsets.
2793 * However, if the average multixact has many members, we might exhaust the
2794 * members space while still using few enough multixacts that these limits fail
2795 * to trigger full table scans for relminmxid advancement. At that point,
2796 * we'd have no choice but to start failing multixact-creating operations
2799 * To prevent that, if more than a threshold portion of the members space is
2800 * used, we effectively reduce autovacuum_multixact_freeze_max_age
2801 * to a value just less than the number of multixacts in use. We hope that
2802 * this will quickly trigger autovacuuming on the table or tables with the
2803 * oldest relminmxid, thus allowing datminmxid values to advance and removing
2806 * As the fraction of the member space currently in use grows, we become
2807 * more aggressive in clamping this value. That not only causes autovacuum
2808 * to ramp up, but also makes any manual vacuums the user issues more
2809 * aggressive. This happens because vacuum_set_xid_limits() clamps the
2810 * freeze table and the minimum freeze age based on the effective
2811 * autovacuum_multixact_freeze_max_age this function returns. In the worst
2812 * case, we'll clamp the freeze_max_age to zero, and every vacuum of any
2813 * table will try to freeze every multixact.
2815 * It's possible that these thresholds should be user-tunable, but for now
2816 * we keep it simple.
2819 MultiXactMemberFreezeThreshold(void)
2821 MultiXactOffset members;
2823 uint32 victim_multixacts;
2826 /* If we can't determine member space utilization, assume the worst. */
/* NOTE(review): the value returned on this path is not visible in this listing. */
2827 if (!ReadMultiXactCounts(&multixacts, &members))
2830 /* If member space utilization is low, no special action is required. */
2831 if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2832 return autovacuum_multixact_freeze_max_age;
2835 * Compute a target for relminmxid advancement. The number of multixacts
2836 * we try to eliminate from the system is based on how far we are past
2837 * MULTIXACT_MEMBER_SAFE_THRESHOLD.
/* fraction is 0 at the safe threshold and 1.0 at the danger threshold, growing linearly with members. */
2839 fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
2840 (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
/* The floating-point product is converted back to uint32 by this assignment. */
2841 victim_multixacts = multixacts * fraction;
2843 /* fraction could be > 1.0, but lowest possible freeze age is zero */
/* NOTE(review): the statement taken when this test is true is not visible in this listing. */
2844 if (victim_multixacts > multixacts)
/* Otherwise the result is the multixact count left after removing the victims. */
2846 return multixacts - victim_multixacts;
/* State handed to SlruScanDirCbFindEarliest through its data pointer. */
2849 typedef struct mxtruncinfo
/* Earliest page seen so far; -1 means none yet (the callback tests for -1 and TruncateMultiXact initializes it to -1). */
2851 int earliestExistingPage;
2855 * SlruScanDirectory callback
2856 * This callback determines the earliest existing page number.
/* filename is not referenced by any visible line; data must point to an mxtruncinfo. */
2859 SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
2861 mxtruncinfo *trunc = (mxtruncinfo *) data;
/* Take segpage if nothing has been recorded yet, or if it precedes the current record under ctl's own page ordering. */
2863 if (trunc->earliestExistingPage == -1 ||
2864 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2866 trunc->earliestExistingPage = segpage;
2869 return false; /* keep going */
2874 * Delete members segments [oldest, newOldest)
2876 * The members SLRU can, in contrast to the offsets one, be filled to almost
2877 * the full range at once. This means SimpleLruTruncate() can't trivially be
2878 * used - instead the to-be-deleted range is computed using the offsets
2879 * SLRU. C.f. TruncateMultiXact().
/* Both arguments are member offsets; they are converted to segment numbers before any deletion. */
2882 PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
/* maxsegment is the segment holding MaxMultiXactOffset, i.e. the last one before segment numbers wrap. */
2884 const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
2885 int startsegment = MXOffsetToMemberSegment(oldestOffset);
2886 int endsegment = MXOffsetToMemberSegment(newOldestOffset);
2887 int segment = startsegment;
2890 * Delete all the segments but the last one. The last segment can still
2891 * contain, possibly partially, valid data.
/* The test is != rather than <, so the walk also terminates when it has to wrap past maxsegment. */
2893 while (segment != endsegment)
2895 elog(DEBUG2, "truncating multixact members segment %x", segment);
2896 SlruDeleteSegment(MultiXactMemberCtl, segment);
2898 /* move to next segment, handling wraparound correctly */
/*
 * NOTE(review): the two assignments that advance segment (restart after
 * maxsegment, otherwise step forward) are not visible in this listing.
 */
2899 if (segment == maxsegment)
2907 * Delete offsets segments [oldest, newOldest)
/* oldestMulti is not referenced by any visible line; the cutoff is derived from newOldestMulti alone. */
2910 PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
2913 * We step back one multixact to avoid passing a cutoff page that hasn't
2914 * been created yet in the rare case that oldestMulti would be the first
2915 * item on a page and oldestMulti == nextMulti. In that case, if we
2916 * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
/* The cutoff page is the offsets page of the multixact just before newOldestMulti. */
2919 SimpleLruTruncate(MultiXactOffsetCtl,
2920 MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
2924 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
2925 * ones still of interest.
2927 * This is only called on a primary as part of vacuum (via
2928 * vac_truncate_clog()). During recovery truncation is done by replaying
2929 * truncation WAL records logged here.
2931 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
2932 * is one of the databases preventing newOldestMulti from increasing.
2935 TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
2937 MultiXactId oldestMulti;
2938 MultiXactId nextMulti;
2939 MultiXactOffset newOldestOffset;
2940 MultiXactOffset oldestOffset;
2941 MultiXactOffset nextOffset;
2943 MultiXactId earliest;
2945 Assert(!RecoveryInProgress());
2946 Assert(MultiXactState->finishedStartup);
2949 * We can only allow one truncation to happen at once. Otherwise parts of
2950 * members might vanish while we're doing lookups or similar. There's no
2951 * need to have an interlock with creating new multis or such, since those
2952 * are constrained by the limits (which only grow, never shrink).
2954 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
2956 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2957 nextMulti = MultiXactState->nextMXact;
2958 nextOffset = MultiXactState->nextOffset;
2959 oldestMulti = MultiXactState->oldestMultiXactId;
2960 LWLockRelease(MultiXactGenLock);
2961 Assert(MultiXactIdIsValid(oldestMulti));
2964 * Make sure to only attempt truncation if there's values to truncate
2965 * away. In normal processing values shouldn't go backwards, but there's
2966 * some corner cases (due to bugs) where that's possible.
2968 if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
2970 LWLockRelease(MultiXactTruncationLock);
2975 * Note we can't just plow ahead with the truncation; it's possible that
2976 * there are no segments to truncate, which is a problem because we are
2977 * going to attempt to read the offsets page to determine where to
2978 * truncate the members SLRU. So we first scan the directory to determine
2979 * the earliest offsets page number that we can read without error.
2981 * NB: It's also possible that the page that oldestMulti is on has already
2982 * been truncated away, and we crashed before updating oldestMulti.
2984 trunc.earliestExistingPage = -1;
2985 SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
2986 earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
2987 if (earliest < FirstMultiXactId)
2988 earliest = FirstMultiXactId;
2990 /* If there's nothing to remove, we can bail out early. */
2991 if (MultiXactIdPrecedes(oldestMulti, earliest))
2993 LWLockRelease(MultiXactTruncationLock);
2998 * First, compute the safe truncation point for MultiXactMember. This is
2999 * the starting offset of the oldest multixact.
3001 * Hopefully, find_multixact_start will always work here, because we've
3002 * already checked that it doesn't precede the earliest MultiXact on disk.
3003 * But if it fails, don't truncate anything, and log a message.
3005 if (oldestMulti == nextMulti)
3007 /* there are NO MultiXacts */
3008 oldestOffset = nextOffset;
3010 else if (!find_multixact_start(oldestMulti, &oldestOffset))
3013 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3014 oldestMulti, earliest)));
3015 LWLockRelease(MultiXactTruncationLock);
3020 * Secondly compute up to where to truncate. Lookup the corresponding
3021 * member offset for newOldestMulti for that.
3023 if (newOldestMulti == nextMulti)
3025 /* there are NO MultiXacts */
3026 newOldestOffset = nextOffset;
3028 else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3031 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3033 LWLockRelease(MultiXactTruncationLock);
3037 elog(DEBUG1, "performing multixact truncation: "
3038 "offsets [%u, %u), offsets segments [%x, %x), "
3039 "members [%u, %u), members segments [%x, %x)",
3040 oldestMulti, newOldestMulti,
3041 MultiXactIdToOffsetSegment(oldestMulti),
3042 MultiXactIdToOffsetSegment(newOldestMulti),
3043 oldestOffset, newOldestOffset,
3044 MXOffsetToMemberSegment(oldestOffset),
3045 MXOffsetToMemberSegment(newOldestOffset));
3048 * Do truncation, and the WAL logging of the truncation, in a critical
3049 * section. That way offsets/members cannot get out of sync anymore, i.e.
3050 * once consistent the newOldestMulti will always exist in members, even
3051 * if we crashed in the wrong moment.
3053 START_CRIT_SECTION();
3056 * Prevent checkpoints from being scheduled concurrently. This is critical
3057 * because otherwise a truncation record might not be replayed after a
3058 * crash/basebackup, even though the state of the data directory would
3061 Assert(!MyPgXact->delayChkpt);
3062 MyPgXact->delayChkpt = true;
3064 /* WAL log truncation */
3065 WriteMTruncateXlogRec(newOldestMultiDB,
3066 oldestMulti, newOldestMulti,
3067 oldestOffset, newOldestOffset);
3070 * Update in-memory limits before performing the truncation, while inside
3071 * the critical section: Have to do it before truncation, to prevent
3072 * concurrent lookups of those values. Has to be inside the critical
3073 * section as otherwise a future call to this function would error out,
3074 * while looking up the oldest member in offsets, if our caller crashes
3075 * before updating the limits.
3077 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3078 MultiXactState->oldestMultiXactId = newOldestMulti;
3079 MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3080 LWLockRelease(MultiXactGenLock);
3082 /* First truncate members */
3083 PerformMembersTruncation(oldestOffset, newOldestOffset);
3086 PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3088 MyPgXact->delayChkpt = false;
3091 LWLockRelease(MultiXactTruncationLock);
3095 * Decide which of two MultiXactOffset page numbers is "older" for truncation
3098 * We need to use comparison of MultiXactId here in order to do the right
3099 * thing with wraparound. However, if we are asked about page number zero, we
3100 * don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get
3101 * weird. So, offset both multis by FirstMultiXactId to avoid that.
3102 * (Actually, the current implementation doesn't do anything weird with
3103 * InvalidMultiXactId, but there's no harm in leaving this code like this.)
3106 MultiXactOffsetPagePrecedes(int page1, int page2)
3111 multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3112 multi1 += FirstMultiXactId;
3113 multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3114 multi2 += FirstMultiXactId;
3116 return MultiXactIdPrecedes(multi1, multi2);
3120 * Decide which of two MultiXactMember page numbers is "older" for truncation
3121 * purposes. There is no "invalid offset number" so use the numbers verbatim.
3124 MultiXactMemberPagePrecedes(int page1, int page2)
3126 MultiXactOffset offset1;
3127 MultiXactOffset offset2;
3129 offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3130 offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3132 return MultiXactOffsetPrecedes(offset1, offset2);
3136 * Decide which of two MultiXactIds is earlier.
3138 * XXX do we need to do something special for InvalidMultiXactId?
3139 * (Doesn't look like it.)
3142 MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
3144 int32 diff = (int32) (multi1 - multi2);
3150 * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3152 * XXX do we need to do something special for InvalidMultiXactId?
3153 * (Doesn't look like it.)
3156 MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
3158 int32 diff = (int32) (multi1 - multi2);
3165 * Decide which of two offsets is earlier.
3168 MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
3170 int32 diff = (int32) (offset1 - offset2);
3176 * Write an xlog record reflecting the zeroing of either a MEMBERs or
3177 * OFFSETs page (info shows which)
3180 WriteMZeroPageXlogRec(int pageno, uint8 info)
3183 XLogRegisterData((char *) (&pageno), sizeof(int));
3184 (void) XLogInsert(RM_MULTIXACT_ID, info);
3188 * Write a TRUNCATE xlog record
3190 * We must flush the xlog record to disk before returning --- see notes in
3194 WriteMTruncateXlogRec(Oid oldestMultiDB,
3195 MultiXactId startTruncOff, MultiXactId endTruncOff,
3196 MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3199 xl_multixact_truncate xlrec;
3201 xlrec.oldestMultiDB = oldestMultiDB;
3203 xlrec.startTruncOff = startTruncOff;
3204 xlrec.endTruncOff = endTruncOff;
3206 xlrec.startTruncMemb = startTruncMemb;
3207 xlrec.endTruncMemb = endTruncMemb;
3210 XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
3211 recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3216 * MULTIXACT resource manager's routines
3219 multixact_redo(XLogReaderState *record)
3221 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3223 /* Backup blocks are not used in multixact records */
3224 Assert(!XLogRecHasAnyBlockRefs(record));
3226 if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3231 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
3233 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
3235 slotno = ZeroMultiXactOffsetPage(pageno, false);
3236 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3237 Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3239 LWLockRelease(MultiXactOffsetControlLock);
3241 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3246 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
3248 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
3250 slotno = ZeroMultiXactMemberPage(pageno, false);
3251 SimpleLruWritePage(MultiXactMemberCtl, slotno);
3252 Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
3254 LWLockRelease(MultiXactMemberControlLock);
3256 else if (info == XLOG_MULTIXACT_CREATE_ID)
3258 xl_multixact_create *xlrec =
3259 (xl_multixact_create *) XLogRecGetData(record);
3260 TransactionId max_xid;
3263 /* Store the data back into the SLRU files */
3264 RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3267 /* Make sure nextMXact/nextOffset are beyond what this record has */
3268 MultiXactAdvanceNextMXact(xlrec->mid + 1,
3269 xlrec->moff + xlrec->nmembers);
3272 * Make sure nextFullXid is beyond any XID mentioned in the record.
3273 * This should be unnecessary, since any XID found here ought to have
3274 * other evidence in the XLOG, but let's be safe.
3276 max_xid = XLogRecGetXid(record);
3277 for (i = 0; i < xlrec->nmembers; i++)
3279 if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3280 max_xid = xlrec->members[i].xid;
3283 AdvanceNextFullTransactionIdPastXid(max_xid);
3285 else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3287 xl_multixact_truncate xlrec;
3290 memcpy(&xlrec, XLogRecGetData(record),
3291 SizeOfMultiXactTruncate);
3293 elog(DEBUG1, "replaying multixact truncation: "
3294 "offsets [%u, %u), offsets segments [%x, %x), "
3295 "members [%u, %u), members segments [%x, %x)",
3296 xlrec.startTruncOff, xlrec.endTruncOff,
3297 MultiXactIdToOffsetSegment(xlrec.startTruncOff),
3298 MultiXactIdToOffsetSegment(xlrec.endTruncOff),
3299 xlrec.startTruncMemb, xlrec.endTruncMemb,
3300 MXOffsetToMemberSegment(xlrec.startTruncMemb),
3301 MXOffsetToMemberSegment(xlrec.endTruncMemb));
3303 /* should not be required, but more than cheap enough */
3304 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3307 * Advance the horizon values, so they're current at the end of
3310 SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3312 PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
3315 * During XLOG replay, latest_page_number isn't necessarily set up
3316 * yet; insert a suitable value to bypass the sanity test in
3317 * SimpleLruTruncate.
3319 pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3320 MultiXactOffsetCtl->shared->latest_page_number = pageno;
3321 PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);
3323 LWLockRelease(MultiXactTruncationLock);
3326 elog(PANIC, "multixact_redo: unknown op code %u", info);
3330 pg_get_multixact_members(PG_FUNCTION_ARGS)
3334 MultiXactMember *members;
3338 MultiXactId mxid = PG_GETARG_UINT32(0);
3340 FuncCallContext *funccxt;
3342 if (mxid < FirstMultiXactId)
3344 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3345 errmsg("invalid MultiXactId: %u", mxid)));
3347 if (SRF_IS_FIRSTCALL())
3349 MemoryContext oldcxt;
3352 funccxt = SRF_FIRSTCALL_INIT();
3353 oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
3355 multi = palloc(sizeof(mxact));
3356 /* no need to allow for old values here */
3357 multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
3361 tupdesc = CreateTemplateTupleDesc(2);
3362 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid",
3364 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode",
3367 funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
3368 funccxt->user_fctx = multi;
3370 MemoryContextSwitchTo(oldcxt);
3373 funccxt = SRF_PERCALL_SETUP();
3374 multi = (mxact *) funccxt->user_fctx;
3376 while (multi->iter < multi->nmembers)
3381 values[0] = psprintf("%u", multi->members[multi->iter].xid);
3382 values[1] = mxstatus_to_string(multi->members[multi->iter].status);
3384 tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
3388 SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
3391 if (multi->nmembers > 0)
3392 pfree(multi->members);
3395 SRF_RETURN_DONE(funccxt);