granicus.if.org Git - postgresql/blob - src/backend/access/transam/multixact.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * multixact.c
   4  *              PostgreSQL multi-transaction-log manager
   5  *
   6  * The pg_multixact manager is a pg_clog-like manager that stores an array of
   7  * MultiXactMember for each MultiXactId.  It is a fundamental part of the
   8  * shared-row-lock implementation.  Each MultiXactMember is comprised of a
   9  * TransactionId and a set of flag bits.  The name is a bit historical:
  10  * originally, a MultiXactId consisted of more than one TransactionId (except
  11  * in rare corner cases), hence "multi".  Nowadays, however, it's perfectly
  12  * legitimate to have MultiXactIds that only include a single Xid.
  13  *
  14  * The meaning of the flag bits is opaque to this module, but they are mostly
  15  * used in heapam.c to identify lock modes that each of the member transactions
  16  * is holding on any given tuple.  This module just contains support to store
  17  * and retrieve the arrays.
  18  *
  19  * We use two SLRU areas, one for storing the offsets at which the data
  20  * starts for each MultiXactId in the other one.  This trick allows us to
  21  * store variable length arrays of TransactionIds.  (We could alternatively
  22  * use one area containing counts and TransactionIds, with valid MultiXactId
  23  * values pointing at slots containing counts; but that way seems less robust
  24  * since it would get completely confused if someone inquired about a bogus
  25  * MultiXactId that pointed to an intermediate slot containing an XID.)
  26  *
  27  * XLOG interactions: this module generates an XLOG record whenever a new
  28  * OFFSETs or MEMBERs page is initialized to zeroes, as well as an XLOG record
  29  * whenever a new MultiXactId is defined.  This allows us to completely
  30  * rebuild the data entered since the last checkpoint during XLOG replay.
  31  * Because this is possible, we need not follow the normal rule of
  32  * "write WAL before data"; the only correctness guarantee needed is that
  33  * we flush and sync all dirty OFFSETs and MEMBERs pages to disk before a
  34  * checkpoint is considered complete.  If a page does make it to disk ahead
  35  * of corresponding WAL records, it will be forcibly zeroed before use anyway.
  36  * Therefore, we don't need to mark our pages with LSN information; we have
  37  * enough synchronization already.
  38  *
  39  * Like clog.c, and unlike subtrans.c, we have to preserve state across
  40  * crashes and ensure that MXID and offset numbering increases monotonically
  41  * across a crash.  We do this in the same way as it's done for transaction
  42  * IDs: the WAL record is guaranteed to contain evidence of every MXID we
  43  * could need to worry about, and we just make sure that at the end of
  44  * replay, the next-MXID and next-offset counters are at least as large as
  45  * anything we saw during replay.
  46  *
  47  * We are able to remove segments no longer necessary by carefully tracking
  48  * each table's used values: during vacuum, any multixact older than a certain
  49  * value is removed; the cutoff value is stored in pg_class.  The minimum value
  50  * across all tables in each database is stored in pg_database, and the global
  51  * minimum across all databases is part of pg_control and is kept in shared
  52  * memory.  Whenever that minimum is advanced, the SLRUs are truncated.
  53  *
  54  * When new multixactid values are to be created, care is taken that the
  55  * counter does not fall within the wraparound horizon considering the global
  56  * minimum value.
  57  *
  58  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
  59  * Portions Copyright (c) 1994, Regents of the University of California
  60  *
  61  * src/backend/access/transam/multixact.c
  62  *
  63  *-------------------------------------------------------------------------
  64  */
  65 #include "postgres.h"
  66
  67 #include "access/multixact.h"
  68 #include "access/slru.h"
  69 #include "access/transam.h"
  70 #include "access/twophase.h"
  71 #include "access/twophase_rmgr.h"
  72 #include "access/xact.h"
  73 #include "access/xlog.h"
  74 #include "access/xloginsert.h"
  75 #include "catalog/pg_type.h"
  76 #include "commands/dbcommands.h"
  77 #include "funcapi.h"
  78 #include "lib/ilist.h"
  79 #include "miscadmin.h"
  80 #include "pg_trace.h"
  81 #include "postmaster/autovacuum.h"
  82 #include "storage/lmgr.h"
  83 #include "storage/pmsignal.h"
  84 #include "storage/proc.h"
  85 #include "storage/procarray.h"
  86 #include "utils/builtins.h"
  87 #include "utils/memutils.h"
  88 #include "utils/snapmgr.h"
  89
  90
  91 /*
  92  * Defines for MultiXactOffset page sizes.  A page is the same BLCKSZ as is
  93  * used everywhere else in Postgres.
  94  *
  95  * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
  96  * MultiXact page numbering also wraps around at
  97  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
  98  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need
  99  * take no explicit notice of that fact in this module, except when comparing
 100  * segment and page numbers in TruncateMultiXact (see
 101  * MultiXactOffsetPagePrecedes).
 102  */
 103
 104 /* We need four bytes per offset */
 105 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
 106
 107 #define MultiXactIdToOffsetPage(xid) \
 108         ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
 109 #define MultiXactIdToOffsetEntry(xid) \
 110         ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
 111 #define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT)
 112
 113 /*
 114  * The situation for members is a bit more complex: we store one byte of
 115  * additional flag bits for each TransactionId.  To do this without getting
 116  * into alignment issues, we store four bytes of flags, and then the
 117  * corresponding 4 Xids.  Each such 5-word (20-byte) set we call a "group", and
 118  * are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
 119  * per page.  This wastes 12 bytes per page, but that's OK -- simplicity (and
 120  * performance) trumps space efficiency here.
 121  *
 122  * Note that the "offset" macros work with byte offset, not array indexes, so
 123  * arithmetic must be done using "char *" pointers.
 124  */
 125 /* We need eight bits per xact, so one xact fits in a byte */
 126 #define MXACT_MEMBER_BITS_PER_XACT                      8
 127 #define MXACT_MEMBER_FLAGS_PER_BYTE                     1
 128 #define MXACT_MEMBER_XACT_BITMASK       ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
 129
 130 /* how many full bytes of flags are there in a group? */
 131 #define MULTIXACT_FLAGBYTES_PER_GROUP           4
 132 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP       \
 133         (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
 134 /* size in bytes of a complete group */
 135 #define MULTIXACT_MEMBERGROUP_SIZE \
 136         (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
 137 #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
 138 #define MULTIXACT_MEMBERS_PER_PAGE      \
 139         (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
 140
 141 /*
 142  * Because the number of items per page is not a divisor of the last item
 143  * number (member 0xFFFFFFFF), the last segment does not use the maximum number
 144  * of pages, and moreover the last used page therein does not use the same
 145  * number of items as previous pages.  (Another way to say it is that the
 146  * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
 147  * has some empty space after that item.)
 148  *
 149  * This constant is the number of members in the last page of the last segment.
 150  */
 151 #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
 152                 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
 153
 154 /* page in which a member is to be found */
 155 #define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
 156 #define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT)
 157
 158 /* Location (byte offset within page) of flag word for a given member */
 159 #define MXOffsetToFlagsOffset(xid) \
 160         ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
 161           (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
 162          (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
 163 #define MXOffsetToFlagsBitShift(xid) \
 164         (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
 165          MXACT_MEMBER_BITS_PER_XACT)
 166
 167 /* Location (byte offset within page) of TransactionId of given member */
 168 #define MXOffsetToMemberOffset(xid) \
 169         (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
 170          ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
 171
 172 /* Multixact members wraparound thresholds. */
 173 #define MULTIXACT_MEMBER_SAFE_THRESHOLD         (MaxMultiXactOffset / 2)
 174 #define MULTIXACT_MEMBER_DANGER_THRESHOLD       \
 175         (MaxMultiXactOffset - MaxMultiXactOffset / 4)
 176
 177 #define PreviousMultiXactId(xid) \
 178         ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1)
 179
 180 /*
 181  * Links to shared-memory data structures for MultiXact control
 182  */
 183 static SlruCtlData MultiXactOffsetCtlData;
 184 static SlruCtlData MultiXactMemberCtlData;
 185
 186 #define MultiXactOffsetCtl      (&MultiXactOffsetCtlData)
 187 #define MultiXactMemberCtl      (&MultiXactMemberCtlData)
 188
 189 /*
 190  * MultiXact state shared across all backends.  All this state is protected
 191  * by MultiXactGenLock.  (We also use MultiXactOffsetControlLock and
 192  * MultiXactMemberControlLock to guard accesses to the two sets of SLRU
 193  * buffers.  For concurrency's sake, we avoid holding more than one of these
 194  * locks at a time.)
 195  */
 196 typedef struct MultiXactStateData
 197 {
 198         /* next-to-be-assigned MultiXactId */
 199         MultiXactId nextMXact;
 200
 201         /* next-to-be-assigned offset */
 202         MultiXactOffset nextOffset;
 203
 204         /* Have we completed multixact startup? */
 205         bool            finishedStartup;
 206
 207         /*
 208          * Oldest multixact that is still potentially referenced by a relation.
 209          * Anything older than this should not be consulted.  These values are
 210          * updated by vacuum.
 211          */
 212         MultiXactId oldestMultiXactId;
 213         Oid                     oldestMultiXactDB;
 214
 215         /*
 216          * Oldest multixact offset that is potentially referenced by a multixact
 217          * referenced by a relation.  We don't always know this value, so there's
 218          * a flag here to indicate whether or not we currently do.
 219          */
 220         MultiXactOffset oldestOffset;
 221         bool            oldestOffsetKnown;
 222
 223         /*
 224          * True if a multixact truncation WAL record was replayed since the last
 225          * checkpoint. This is used to trigger 'legacy truncations', i.e. truncate
 226          * by looking at the data directory during WAL replay, when the primary is
 227          * too old to generate truncation records.
 228          */
 229         bool            sawTruncationInCkptCycle;
 230
 231         /* support for anti-wraparound measures */
 232         MultiXactId multiVacLimit;
 233         MultiXactId multiWarnLimit;
 234         MultiXactId multiStopLimit;
 235         MultiXactId multiWrapLimit;
 236
 237         /* support for members anti-wraparound measures */
 238         MultiXactOffset offsetStopLimit;        /* known if oldestOffsetKnown */
 239
 240         /*
 241          * Per-backend data starts here.  We have two arrays stored in the area
 242          * immediately following the MultiXactStateData struct. Each is indexed by
 243          * BackendId.
 244          *
 245          * In both arrays, there's a slot for all normal backends (1..MaxBackends)
 246          * followed by a slot for max_prepared_xacts prepared transactions. Valid
 247          * BackendIds start from 1; element zero of each array is never used.
 248          *
 249          * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
 250          * transaction(s) could possibly be a member of, or InvalidMultiXactId
 251          * when the backend has no live transaction that could possibly be a
 252          * member of a MultiXact.  Each backend sets its entry to the current
 253          * nextMXact counter just before first acquiring a shared lock in a given
 254          * transaction, and clears it at transaction end. (This works because only
 255          * during or after acquiring a shared lock could an XID possibly become a
 256          * member of a MultiXact, and that MultiXact would have to be created
 257          * during or after the lock acquisition.)
 258          *
 259          * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
 260          * current transaction(s) think is potentially live, or InvalidMultiXactId
 261          * when not in a transaction or not in a transaction that's paid any
 262          * attention to MultiXacts yet.  This is computed when first needed in a
 263          * given transaction, and cleared at transaction end.  We can compute it
 264          * as the minimum of the valid OldestMemberMXactId[] entries at the time
 265          * we compute it (using nextMXact if none are valid).  Each backend is
 266          * required not to attempt to access any SLRU data for MultiXactIds older
 267          * than its own OldestVisibleMXactId[] setting; this is necessary because
 268          * the checkpointer could truncate away such data at any instant.
 269          *
 270          * The oldest valid value among all of the OldestMemberMXactId[] and
 271          * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
 272          * possible value still having any live member transaction.  Subtracting
 273          * vacuum_multixact_freeze_min_age from that value we obtain the freezing
 274          * point for multixacts for that table.  Any value older than that is
 275          * removed from tuple headers (or "frozen"; see FreezeMultiXactId.  Note
 276          * that multis that have member xids that are older than the cutoff point
 277          * for xids must also be frozen, even if the multis themselves are newer
 278          * than the multixid cutoff point).  Whenever a full table vacuum happens,
 279          * the freezing point so computed is used as the new pg_class.relminmxid
 280          * value.  The minimum of all those values in a database is stored as
 281          * pg_database.datminmxid.  In turn, the minimum of all of those values is
 282          * stored in pg_control and used as truncation point for pg_multixact.  At
 283          * checkpoint or restartpoint, unneeded segments are removed.
 284          */
 285         MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
 286 } MultiXactStateData;
 287
 288 /*
 289  * Last element of OldestMemberMXactID and OldestVisibleMXactId arrays.
 290  * Valid elements are (1..MaxOldestSlot); element 0 is never used.
 291  */
 292 #define MaxOldestSlot   (MaxBackends + max_prepared_xacts)
 293
 294 /* Pointers to the state data in shared memory */
 295 static MultiXactStateData *MultiXactState;
 296 static MultiXactId *OldestMemberMXactId;
 297 static MultiXactId *OldestVisibleMXactId;
 298
 299
 300 /*
 301  * Definitions for the backend-local MultiXactId cache.
 302  *
 303  * We use this cache to store known MultiXacts, so we don't need to go to
 304  * SLRU areas every time.
 305  *
 306  * The cache lasts for the duration of a single transaction, the rationale
 307  * for this being that most entries will contain our own TransactionId and
 308  * so they will be uninteresting by the time our next transaction starts.
 309  * (XXX not clear that this is correct --- other members of the MultiXact
 310  * could hang around longer than we did.  However, it's not clear what a
 311  * better policy for flushing old cache entries would be.)      FIXME actually
 312  * this is plain wrong now that multixact's may contain update Xids.
 313  *
 314  * We allocate the cache entries in a memory context that is deleted at
 315  * transaction end, so we don't need to do retail freeing of entries.
 316  */
 317 typedef struct mXactCacheEnt
 318 {
 319         MultiXactId multi;
 320         int                     nmembers;
 321         dlist_node      node;
 322         MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 323 } mXactCacheEnt;
 324
 325 #define MAX_CACHE_ENTRIES       256
 326 static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
 327 static int      MXactCacheMembers = 0;
 328 static MemoryContext MXactContext = NULL;
 329
 330 #ifdef MULTIXACT_DEBUG
 331 #define debug_elog2(a,b) elog(a,b)
 332 #define debug_elog3(a,b,c) elog(a,b,c)
 333 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
 334 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
 335 #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
 336 #else
 337 #define debug_elog2(a,b)
 338 #define debug_elog3(a,b,c)
 339 #define debug_elog4(a,b,c,d)
 340 #define debug_elog5(a,b,c,d,e)
 341 #define debug_elog6(a,b,c,d,e,f)
 342 #endif
 343
 344 /* internal MultiXactId management */
 345 static void MultiXactIdSetOldestVisible(void);
 346 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 347                                    int nmembers, MultiXactMember *members);
 348 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
 349
 350 /* MultiXact cache management */
 351 static int      mxactMemberComparator(const void *arg1, const void *arg2);
 352 static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
 353 static int      mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
 354 static void mXactCachePut(MultiXactId multi, int nmembers,
 355                           MultiXactMember *members);
 356
 357 static char *mxstatus_to_string(MultiXactStatus status);
 358
 359 /* management of SLRU infrastructure */
 360 static int      ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
 361 static int      ZeroMultiXactMemberPage(int pageno, bool writeXlog);
 362 static bool MultiXactOffsetPagePrecedes(int page1, int page2);
 363 static bool MultiXactMemberPagePrecedes(int page1, int page2);
 364 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
 365                                                 MultiXactOffset offset2);
 366 static void ExtendMultiXactOffset(MultiXactId multi);
 367 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
 368 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
 369                                                  MultiXactOffset start, uint32 distance);
 370 static bool SetOffsetVacuumLimit(void);
 371 static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
 372 static void WriteMZeroPageXlogRec(int pageno, uint8 info);
 373 static void WriteMTruncateXlogRec(Oid oldestMultiDB,
 374                                           MultiXactId startOff, MultiXactId endOff,
 375                                           MultiXactOffset startMemb, MultiXactOffset endMemb);
 376
 377
 378 /*
 379  * MultiXactIdCreate
 380  *              Construct a MultiXactId representing two TransactionIds.
 381  *
 382  * The two XIDs must be different, or be requesting different statuses.
 383  *
 384  * NB - we don't worry about our local MultiXactId cache here, because that
 385  * is handled by the lower-level routines.
 386  */
 387 MultiXactId
 388 MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
 389                                   TransactionId xid2, MultiXactStatus status2)
 390 {
 391         MultiXactId newMulti;
 392         MultiXactMember members[2];
 393
 394         AssertArg(TransactionIdIsValid(xid1));
 395         AssertArg(TransactionIdIsValid(xid2));
 396
 397         Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
 398
 399         /* MultiXactIdSetOldestMember() must have been called already. */
 400         Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
 401
 402         /*
 403          * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
 404          * are still running.  In typical usage, xid2 will be our own XID and the
 405          * caller just did a check on xid1, so it'd be wasted effort.
 406          */
 407
 408         members[0].xid = xid1;
 409         members[0].status = status1;
 410         members[1].xid = xid2;
 411         members[1].status = status2;
 412
 413         newMulti = MultiXactIdCreateFromMembers(2, members);
 414
 415         debug_elog3(DEBUG2, "Create: %s",
 416                                 mxid_to_string(newMulti, 2, members));
 417
 418         return newMulti;
 419 }
 420
 421 /*
 422  * MultiXactIdExpand
 423  *              Add a TransactionId to a pre-existing MultiXactId.
 424  *
 425  * If the TransactionId is already a member of the passed MultiXactId with the
 426  * same status, just return it as-is.
 427  *
 428  * Note that we do NOT actually modify the membership of a pre-existing
 429  * MultiXactId; instead we create a new one.  This is necessary to avoid
 430  * a race condition against code trying to wait for one MultiXactId to finish;
 431  * see notes in heapam.c.
 432  *
 433  * NB - we don't worry about our local MultiXactId cache here, because that
 434  * is handled by the lower-level routines.
 435  *
 436  * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
 437  * one upgraded by pg_upgrade from a cluster older than this feature) are not
 438  * passed in.
 439  */
 440 MultiXactId
 441 MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 442 {
 443         MultiXactId newMulti;
 444         MultiXactMember *members;
 445         MultiXactMember *newMembers;
 446         int                     nmembers;
 447         int                     i;
 448         int                     j;
 449
 450         AssertArg(MultiXactIdIsValid(multi));
 451         AssertArg(TransactionIdIsValid(xid));
 452
 453         /* MultiXactIdSetOldestMember() must have been called already. */
 454         Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
 455
 456         debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
 457                                 multi, xid, mxstatus_to_string(status));
 458
 459         /*
 460          * Note: we don't allow for old multis here.  The reason is that the only
 461          * caller of this function does a check that the multixact is no longer
 462          * running.
 463          */
 464         nmembers = GetMultiXactIdMembers(multi, &members, false, false);
 465
 466         if (nmembers < 0)
 467         {
 468                 MultiXactMember member;
 469
 470                 /*
 471                  * The MultiXactId is obsolete.  This can only happen if all the
 472                  * MultiXactId members stop running between the caller checking and
 473                  * passing it to us.  It would be better to return that fact to the
 474                  * caller, but it would complicate the API and it's unlikely to happen
 475                  * too often, so just deal with it by creating a singleton MultiXact.
 476                  */
 477                 member.xid = xid;
 478                 member.status = status;
 479                 newMulti = MultiXactIdCreateFromMembers(1, &member);
 480
 481                 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
 482                                         multi, newMulti);
 483                 return newMulti;
 484         }
 485
 486         /*
 487          * If the TransactionId is already a member of the MultiXactId with the
 488          * same status, just return the existing MultiXactId.
 489          */
 490         for (i = 0; i < nmembers; i++)
 491         {
 492                 if (TransactionIdEquals(members[i].xid, xid) &&
 493                         (members[i].status == status))
 494                 {
 495                         debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
 496                                                 xid, multi);
 497                         pfree(members);
 498                         return multi;
 499                 }
 500         }
 501
 502         /*
 503          * Determine which of the members of the MultiXactId are still of
 504          * interest. This is any running transaction, and also any transaction
 505          * that grabbed something stronger than just a lock and was committed. (An
 506          * update that aborted is of no interest here; and having more than one
 507          * update Xid in a multixact would cause errors elsewhere.)
 508          *
 509          * Removing dead members is not just an optimization: freezing of tuples
 510          * whose Xmax are multis depends on this behavior.
 511          *
 512          * Note we have the same race condition here as above: j could be 0 at the
 513          * end of the loop.
 514          */
 515         newMembers = (MultiXactMember *)
 516                 palloc(sizeof(MultiXactMember) * (nmembers + 1));
 517
 518         for (i = 0, j = 0; i < nmembers; i++)
 519         {
 520                 if (TransactionIdIsInProgress(members[i].xid) ||
 521                         (ISUPDATE_from_mxstatus(members[i].status) &&
 522                          TransactionIdDidCommit(members[i].xid)))
 523                 {
 524                         newMembers[j].xid = members[i].xid;
 525                         newMembers[j++].status = members[i].status;
 526                 }
 527         }
 528
 529         newMembers[j].xid = xid;
 530         newMembers[j++].status = status;
 531         newMulti = MultiXactIdCreateFromMembers(j, newMembers);
 532
 533         pfree(members);
 534         pfree(newMembers);
 535
 536         debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
 537
 538         return newMulti;
 539 }
 540
 541 /*
 542  * MultiXactIdIsRunning
 543  *              Returns whether a MultiXactId is "running".
 544  *
 545  * We return true if at least one member of the given MultiXactId is still
 546  * running.  Note that a "false" result is certain not to change,
 547  * because it is not legal to add members to an existing MultiXactId.
 548  *
 549  * Caller is expected to have verified that the multixact does not come from
 550  * a pg_upgraded share-locked tuple.
 551  */
 552 bool
 553 MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
 554 {
 555         MultiXactMember *members;
 556         int                     nmembers;
 557         int                     i;
 558
 559         debug_elog3(DEBUG2, "IsRunning %u?", multi);
 560
 561         /*
 562          * "false" here means we assume our callers have checked that the given
 563          * multi cannot possibly come from a pg_upgraded database.
 564          */
 565         nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
 566
 567         if (nmembers <= 0)
 568         {
 569                 debug_elog2(DEBUG2, "IsRunning: no members");
 570                 return false;
 571         }
 572
 573         /*
 574          * Checking for myself is cheap compared to looking in shared memory;
 575          * return true if any live subtransaction of the current top-level
 576          * transaction is a member.
 577          *
 578          * This is not needed for correctness, it's just a fast path.
 579          */
 580         for (i = 0; i < nmembers; i++)
 581         {
 582                 if (TransactionIdIsCurrentTransactionId(members[i].xid))
 583                 {
 584                         debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
 585                         pfree(members);
 586                         return true;
 587                 }
 588         }
 589
 590         /*
 591          * This could be made faster by having another entry point in procarray.c,
 592          * walking the PGPROC array only once for all the members.  But in most
 593          * cases nmembers should be small enough that it doesn't much matter.
 594          */
 595         for (i = 0; i < nmembers; i++)
 596         {
 597                 if (TransactionIdIsInProgress(members[i].xid))
 598                 {
 599                         debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
 600                                                 i, members[i].xid);
 601                         pfree(members);
 602                         return true;
 603                 }
 604         }
 605
 606         pfree(members);
 607
 608         debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
 609
 610         return false;
 611 }
 612
 613 /*
 614  * MultiXactIdSetOldestMember
 615  *              Save the oldest MultiXactId this transaction could be a member of.
 616  *
 617  * We set the OldestMemberMXactId for a given transaction the first time it's
 618  * going to do some operation that might require a MultiXactId (tuple lock,
 619  * update or delete).  We need to do this even if we end up using a
 620  * TransactionId instead of a MultiXactId, because there is a chance that
 621  * another transaction would add our XID to a MultiXactId.
 622  *
 623  * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
 624  * be called just before doing any such possibly-MultiXactId-able operation.
 625  */
 626 void
 627 MultiXactIdSetOldestMember(void)
 628 {
 629         if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]))
 630         {
 631                 MultiXactId nextMXact;
 632
 633                 /*
 634                  * You might think we don't need to acquire a lock here, since
 635                  * fetching and storing of TransactionIds is probably atomic, but in
 636                  * fact we do: suppose we pick up nextMXact and then lose the CPU for
 637                  * a long time.  Someone else could advance nextMXact, and then
 638                  * another someone else could compute an OldestVisibleMXactId that
 639                  * would be after the value we are going to store when we get control
 640                  * back.  Which would be wrong.
 641                  *
 642                  * Note that a shared lock is sufficient, because it's enough to stop
 643                  * someone from advancing nextMXact; and nobody else could be trying
 644                  * to write to our OldestMember entry, only reading (and we assume
 645                  * storing it is atomic.)
 646                  */
 647                 LWLockAcquire(MultiXactGenLock, LW_SHARED);
 648
 649                 /*
 650                  * We have to beware of the possibility that nextMXact is in the
 651                  * wrapped-around state.  We don't fix the counter itself here, but we
 652                  * must be sure to store a valid value in our array entry.
 653                  */
 654                 nextMXact = MultiXactState->nextMXact;
 655                 if (nextMXact < FirstMultiXactId)
 656                         nextMXact = FirstMultiXactId;
 657
 658                 OldestMemberMXactId[MyBackendId] = nextMXact;
 659
 660                 LWLockRelease(MultiXactGenLock);
 661
 662                 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
 663                                         MyBackendId, nextMXact);
 664         }
 665 }
 666
 667 /*
 668  * MultiXactIdSetOldestVisible
 669  *              Save the oldest MultiXactId this transaction considers possibly live.
 670  *
 671  * We set the OldestVisibleMXactId for a given transaction the first time
 672  * it's going to inspect any MultiXactId.  Once we have set this, we are
 673  * guaranteed that the checkpointer won't truncate off SLRU data for
 674  * MultiXactIds at or after our OldestVisibleMXactId.
 675  *
 676  * The value to set is the oldest of nextMXact and all the valid per-backend
 677  * OldestMemberMXactId[] entries.  Because of the locking we do, we can be
 678  * certain that no subsequent call to MultiXactIdSetOldestMember can set
 679  * an OldestMemberMXactId[] entry older than what we compute here.  Therefore
 680  * there is no live transaction, now or later, that can be a member of any
 681  * MultiXactId older than the OldestVisibleMXactId we compute here.
 682  */
 683 static void
 684 MultiXactIdSetOldestVisible(void)
 685 {
 686         if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
 687         {
 688                 MultiXactId oldestMXact;
 689                 int                     i;
 690
 691                 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 692
 693                 /*
 694                  * We have to beware of the possibility that nextMXact is in the
 695                  * wrapped-around state.  We don't fix the counter itself here, but we
 696                  * must be sure to store a valid value in our array entry.
 697                  */
 698                 oldestMXact = MultiXactState->nextMXact;
 699                 if (oldestMXact < FirstMultiXactId)
 700                         oldestMXact = FirstMultiXactId;
 701
 702                 for (i = 1; i <= MaxOldestSlot; i++)
 703                 {
 704                         MultiXactId thisoldest = OldestMemberMXactId[i];
 705
 706                         if (MultiXactIdIsValid(thisoldest) &&
 707                                 MultiXactIdPrecedes(thisoldest, oldestMXact))
 708                                 oldestMXact = thisoldest;
 709                 }
 710
 711                 OldestVisibleMXactId[MyBackendId] = oldestMXact;
 712
 713                 LWLockRelease(MultiXactGenLock);
 714
 715                 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
 716                                         MyBackendId, oldestMXact);
 717         }
 718 }
 719
 720 /*
 721  * ReadNextMultiXactId
 722  *              Return the next MultiXactId to be assigned, but don't allocate it
 723  */
 724 MultiXactId
 725 ReadNextMultiXactId(void)
 726 {
 727         MultiXactId mxid;
 728
 729         /* XXX we could presumably do this without a lock. */
 730         LWLockAcquire(MultiXactGenLock, LW_SHARED);
 731         mxid = MultiXactState->nextMXact;
 732         LWLockRelease(MultiXactGenLock);
 733
 734         if (mxid < FirstMultiXactId)
 735                 mxid = FirstMultiXactId;
 736
 737         return mxid;
 738 }
 739
 740 /*
 741  * MultiXactIdCreateFromMembers
 742  *              Make a new MultiXactId from the specified set of members
 743  *
 744  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
 745  * given TransactionIds as members.  Returns the newly created MultiXactId.
 746  *
 747  * NB: the passed members[] array will be sorted in-place.
 748  */
 749 MultiXactId
 750 MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
 751 {
 752         MultiXactId multi;
 753         MultiXactOffset offset;
 754         xl_multixact_create xlrec;
 755
 756         debug_elog3(DEBUG2, "Create: %s",
 757                                 mxid_to_string(InvalidMultiXactId, nmembers, members));
 758
 759         /*
 760          * See if the same set of members already exists in our cache; if so, just
 761          * re-use that MultiXactId.  (Note: it might seem that looking in our
 762          * cache is insufficient, and we ought to search disk to see if a
 763          * duplicate definition already exists.  But since we only ever create
 764          * MultiXacts containing our own XID, in most cases any such MultiXacts
 765          * were in fact created by us, and so will be in our cache.  There are
 766          * corner cases where someone else added us to a MultiXact without our
 767          * knowledge, but it's not worth checking for.)
 768          */
 769         multi = mXactCacheGetBySet(nmembers, members);
 770         if (MultiXactIdIsValid(multi))
 771         {
 772                 debug_elog2(DEBUG2, "Create: in cache!");
 773                 return multi;
 774         }
 775
 776         /* Verify that there is a single update Xid among the given members. */
 777         {
 778                 int                     i;
 779                 bool            has_update = false;
 780
 781                 for (i = 0; i < nmembers; i++)
 782                 {
 783                         if (ISUPDATE_from_mxstatus(members[i].status))
 784                         {
 785                                 if (has_update)
 786                                         elog(ERROR, "new multixact has more than one updating member");
 787                                 has_update = true;
 788                         }
 789                 }
 790         }
 791
 792         /*
 793          * Assign the MXID and offsets range to use, and make sure there is space
 794          * in the OFFSETs and MEMBERs files.  NB: this routine does
 795          * START_CRIT_SECTION().
 796          *
 797          * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
 798          * that we've called MultiXactIdSetOldestMember here.  This is because
 799          * this routine is used in some places to create new MultiXactIds of which
 800          * the current backend is not a member, notably during freezing of multis
 801          * in vacuum.  During vacuum, in particular, it would be unacceptable to
 802          * keep OldestMulti set, in case it runs for long.
 803          */
 804         multi = GetNewMultiXactId(nmembers, &offset);
 805
 806         /*
 807          * Make an XLOG entry describing the new MXID.
 808          *
 809          * Note: we need not flush this XLOG entry to disk before proceeding. The
 810          * only way for the MXID to be referenced from any data page is for
 811          * heap_lock_tuple() to have put it there, and heap_lock_tuple() generates
 812          * an XLOG record that must follow ours.  The normal LSN interlock between
 813          * the data page and that XLOG record will ensure that our XLOG record
 814          * reaches disk first.  If the SLRU members/offsets data reaches disk
 815          * sooner than the XLOG record, we do not care because we'll overwrite it
 816          * with zeroes unless the XLOG record is there too; see notes at top of
 817          * this file.
 818          */
 819         xlrec.mid = multi;
 820         xlrec.moff = offset;
 821         xlrec.nmembers = nmembers;
 822
 823         /*
 824          * XXX Note: there's a lot of padding space in MultiXactMember.  We could
 825          * find a more compact representation of this Xlog record -- perhaps all
 826          * the status flags in one XLogRecData, then all the xids in another one?
 827          * Not clear that it's worth the trouble though.
 828          */
 829         XLogBeginInsert();
 830         XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
 831         XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember));
 832
 833         (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
 834
 835         /* Now enter the information into the OFFSETs and MEMBERs logs */
 836         RecordNewMultiXact(multi, offset, nmembers, members);
 837
 838         /* Done with critical section */
 839         END_CRIT_SECTION();
 840
 841         /* Store the new MultiXactId in the local cache, too */
 842         mXactCachePut(multi, nmembers, members);
 843
 844         debug_elog2(DEBUG2, "Create: all done");
 845
 846         return multi;
 847 }
 848
 849 /*
 850  * RecordNewMultiXact
 851  *              Write info about a new multixact into the offsets and members files
 852  *
 853  * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
 854  * use it.
 855  */
 856 static void
 857 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 858                                    int nmembers, MultiXactMember *members)
 859 {
 860         int                     pageno;
 861         int                     prev_pageno;
 862         int                     entryno;
 863         int                     slotno;
 864         MultiXactOffset *offptr;
 865         int                     i;
 866
 867         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
 868
 869         pageno = MultiXactIdToOffsetPage(multi);
 870         entryno = MultiXactIdToOffsetEntry(multi);
 871
 872         /*
 873          * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
 874          * to complain about if there's any I/O error.  This is kinda bogus, but
 875          * since the errors will always give the full pathname, it should be clear
 876          * enough that a MultiXactId is really involved.  Perhaps someday we'll
 877          * take the trouble to generalize the slru.c error reporting code.
 878          */
 879         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
 880         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 881         offptr += entryno;
 882
 883         *offptr = offset;
 884
 885         MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 886
 887         /* Exchange our lock */
 888         LWLockRelease(MultiXactOffsetControlLock);
 889
 890         LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
 891
 892         prev_pageno = -1;
 893
 894         for (i = 0; i < nmembers; i++, offset++)
 895         {
 896                 TransactionId *memberptr;
 897                 uint32     *flagsptr;
 898                 uint32          flagsval;
 899                 int                     bshift;
 900                 int                     flagsoff;
 901                 int                     memberoff;
 902
 903                 Assert(members[i].status <= MultiXactStatusUpdate);
 904
 905                 pageno = MXOffsetToMemberPage(offset);
 906                 memberoff = MXOffsetToMemberOffset(offset);
 907                 flagsoff = MXOffsetToFlagsOffset(offset);
 908                 bshift = MXOffsetToFlagsBitShift(offset);
 909
 910                 if (pageno != prev_pageno)
 911                 {
 912                         slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
 913                         prev_pageno = pageno;
 914                 }
 915
 916                 memberptr = (TransactionId *)
 917                         (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 918
 919                 *memberptr = members[i].xid;
 920
 921                 flagsptr = (uint32 *)
 922                         (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
 923
 924                 flagsval = *flagsptr;
 925                 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
 926                 flagsval |= (members[i].status << bshift);
 927                 *flagsptr = flagsval;
 928
 929                 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
 930         }
 931
 932         LWLockRelease(MultiXactMemberControlLock);
 933 }
 934
 935 /*
 936  * GetNewMultiXactId
 937  *              Get the next MultiXactId.
 938  *
 939  * Also, reserve the needed amount of space in the "members" area.  The
 940  * starting offset of the reserved space is returned in *offset.
 941  *
 942  * This may generate XLOG records for expansion of the offsets and/or members
 943  * files.  Unfortunately, we have to do that while holding MultiXactGenLock
 944  * to avoid race conditions --- the XLOG record for zeroing a page must appear
 945  * before any backend can possibly try to store data in that page!
 946  *
 947  * We start a critical section before advancing the shared counters.  The
 948  * caller must end the critical section after writing SLRU data.
 949  */
 950 static MultiXactId
 951 GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 952 {
 953         MultiXactId result;
 954         MultiXactOffset nextOffset;
 955
 956         debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
 957
 958         /* safety check, we should never get this far in a HS slave */
 959         if (RecoveryInProgress())
 960                 elog(ERROR, "cannot assign MultiXactIds during recovery");
 961
 962         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 963
 964         /* Handle wraparound of the nextMXact counter */
 965         if (MultiXactState->nextMXact < FirstMultiXactId)
 966                 MultiXactState->nextMXact = FirstMultiXactId;
 967
 968         /* Assign the MXID */
 969         result = MultiXactState->nextMXact;
 970
 971         /*----------
 972          * Check to see if it's safe to assign another MultiXactId.  This protects
 973          * against catastrophic data loss due to multixact wraparound.  The basic
 974          * rules are:
 975          *
 976          * If we're past multiVacLimit or the safe threshold for member storage
 977          * space, or we don't know what the safe threshold for member storage is,
 978          * start trying to force autovacuum cycles.
 979          * If we're past multiWarnLimit, start issuing warnings.
 980          * If we're past multiStopLimit, refuse to create new MultiXactIds.
 981          *
 982          * Note these are pretty much the same protections in GetNewTransactionId.
 983          *----------
 984          */
 985         if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
 986         {
 987                 /*
 988                  * For safety's sake, we release MultiXactGenLock while sending
 989                  * signals, warnings, etc.  This is not so much because we care about
 990                  * preserving concurrency in this situation, as to avoid any
 991                  * possibility of deadlock while doing get_database_name(). First,
 992                  * copy all the shared values we'll need in this path.
 993                  */
 994                 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
 995                 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
 996                 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
 997                 Oid                     oldest_datoid = MultiXactState->oldestMultiXactDB;
 998
 999                 LWLockRelease(MultiXactGenLock);
1000
1001                 if (IsUnderPostmaster &&
1002                         !MultiXactIdPrecedes(result, multiStopLimit))
1003                 {
1004                         char       *oldest_datname = get_database_name(oldest_datoid);
1005
1006                         /*
1007                          * Immediately kick autovacuum into action as we're already
1008                          * in ERROR territory.
1009                          */
1010                         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1011
1012                         /* complain even if that DB has disappeared */
1013                         if (oldest_datname)
1014                                 ereport(ERROR,
1015                                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1016                                                  errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1017                                                                 oldest_datname),
1018                                  errhint("Execute a database-wide VACUUM in that database.\n"
1019                                                  "You might also need to commit or roll back old prepared transactions.")));
1020                         else
1021                                 ereport(ERROR,
1022                                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1023                                                  errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u",
1024                                                                 oldest_datoid),
1025                                  errhint("Execute a database-wide VACUUM in that database.\n"
1026                                                  "You might also need to commit or roll back old prepared transactions.")));
1027                 }
1028
1029                 /*
1030                  * To avoid swamping the postmaster with signals, we issue the autovac
1031                  * request only once per 64K multis generated.  This still gives
1032                  * plenty of chances before we get into real trouble.
1033                  */
1034                 if (IsUnderPostmaster && (result % 65536) == 0)
1035                         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1036
1037                 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1038                 {
1039                         char       *oldest_datname = get_database_name(oldest_datoid);
1040
1041                         /* complain even if that DB has disappeared */
1042                         if (oldest_datname)
1043                                 ereport(WARNING,
1044                                                 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1045                                                                            "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1046                                                                            multiWrapLimit - result,
1047                                                                            oldest_datname,
1048                                                                            multiWrapLimit - result),
1049                                  errhint("Execute a database-wide VACUUM in that database.\n"
1050                                                  "You might also need to commit or roll back old prepared transactions.")));
1051                         else
1052                                 ereport(WARNING,
1053                                                 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1054                                                                            "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1055                                                                            multiWrapLimit - result,
1056                                                                            oldest_datoid,
1057                                                                            multiWrapLimit - result),
1058                                  errhint("Execute a database-wide VACUUM in that database.\n"
1059                                                  "You might also need to commit or roll back old prepared transactions.")));
1060                 }
1061
1062                 /* Re-acquire lock and start over */
1063                 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1064                 result = MultiXactState->nextMXact;
1065                 if (result < FirstMultiXactId)
1066                         result = FirstMultiXactId;
1067         }
1068
1069         /* Make sure there is room for the MXID in the file.  */
1070         ExtendMultiXactOffset(result);
1071
1072         /*
1073          * Reserve the members space, similarly to above.  Also, be careful not to
1074          * return zero as the starting offset for any multixact. See
1075          * GetMultiXactIdMembers() for motivation.
1076          */
1077         nextOffset = MultiXactState->nextOffset;
1078         if (nextOffset == 0)
1079         {
1080                 *offset = 1;
1081                 nmembers++;                             /* allocate member slot 0 too */
1082         }
1083         else
1084                 *offset = nextOffset;
1085
1086         /*----------
1087          * Protect against overrun of the members space as well, with the
1088          * following rules:
1089          *
1090          * If we're past offsetStopLimit, refuse to generate more multis.
1091          * If we're close to offsetStopLimit, emit a warning.
1092          *
1093          * Arbitrarily, we start emitting warnings when we're 20 segments or less
1094          * from offsetStopLimit.
1095          *
1096          * Note we haven't updated the shared state yet, so if we fail at this
1097          * point, the multixact ID we grabbed can still be used by the next guy.
1098          *
1099          * Note that there is no point in forcing autovacuum runs here: the
1100          * multixact freeze settings would have to be reduced for that to have any
1101          * effect.
1102          *----------
1103          */
1104 #define OFFSET_WARN_SEGMENTS    20
1105         if (MultiXactState->oldestOffsetKnown &&
1106                 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
1107                                                                  nmembers))
1108         {
1109                 /* see comment in the corresponding offsets wraparound case */
1110                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1111
1112                 ereport(ERROR,
1113                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1114                                  errmsg("multixact \"members\" limit exceeded"),
1115                                  errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1116                                                                   "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1117                                                         MultiXactState->offsetStopLimit - nextOffset - 1,
1118                                                                   nmembers,
1119                                                    MultiXactState->offsetStopLimit - nextOffset - 1),
1120                                  errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.",
1121                                                  MultiXactState->oldestMultiXactDB)));
1122         }
1123
1124         /*
1125          * Check whether we should kick autovacuum into action, to prevent members
1126          * wraparound. NB we use a much larger window to trigger autovacuum than
1127          * just the warning limit. The warning is just a measure of last resort -
1128          * this is in line with GetNewTransactionId's behaviour.
1129          */
1130         if (!MultiXactState->oldestOffsetKnown ||
1131                 (MultiXactState->nextOffset - MultiXactState->oldestOffset
1132                  > MULTIXACT_MEMBER_SAFE_THRESHOLD))
1133         {
1134                 /*
1135                  * To avoid swamping the postmaster with signals, we issue the autovac
1136                  * request only when crossing a segment boundary. With default
1137                  * compilation settings that's rougly after 50k members.  This still
1138                  * gives plenty of chances before we get into real trouble.
1139                  */
1140                 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1141                         (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1142                         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1143         }
1144
1145         if (MultiXactState->oldestOffsetKnown &&
1146                 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
1147                                                                  nextOffset,
1148                                                                  nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
1149                 ereport(WARNING,
1150                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1151                                  errmsg("database with OID %u must be vacuumed before %d more multixact members are used",
1152                                                 MultiXactState->oldestMultiXactDB,
1153                                         MultiXactState->offsetStopLimit - nextOffset + nmembers),
1154                                  errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.")));
1155
1156         ExtendMultiXactMember(nextOffset, nmembers);
1157
1158         /*
1159          * Critical section from here until caller has written the data into the
1160          * just-reserved SLRU space; we don't want to error out with a partly
1161          * written MultiXact structure.  (In particular, failing to write our
1162          * start offset after advancing nextMXact would effectively corrupt the
1163          * previous MultiXact.)
1164          */
1165         START_CRIT_SECTION();
1166
1167         /*
1168          * Advance counters.  As in GetNewTransactionId(), this must not happen
1169          * until after file extension has succeeded!
1170          *
1171          * We don't care about MultiXactId wraparound here; it will be handled by
1172          * the next iteration.  But note that nextMXact may be InvalidMultiXactId
1173          * or the first value on a segment-beginning page after this routine
1174          * exits, so anyone else looking at the variable must be prepared to deal
1175          * with either case.  Similarly, nextOffset may be zero, but we won't use
1176          * that as the actual start offset of the next multixact.
1177          */
1178         (MultiXactState->nextMXact)++;
1179
1180         MultiXactState->nextOffset += nmembers;
1181
1182         LWLockRelease(MultiXactGenLock);
1183
1184         debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1185         return result;
1186 }
1187
1188 /*
1189  * GetMultiXactIdMembers
1190  *              Returns the set of MultiXactMembers that make up a MultiXactId
1191  *
1192  * If the given MultiXactId is older than the value we know to be oldest, we
1193  * return -1.  The caller is expected to allow that only in permissible cases,
1194  * i.e. when the infomask lets it presuppose that the tuple had been
1195  * share-locked before a pg_upgrade; this means that the HEAP_XMAX_LOCK_ONLY
1196  * needs to be set, but HEAP_XMAX_KEYSHR_LOCK and HEAP_XMAX_EXCL_LOCK are not
1197  * set.
1198  *
1199  * Other border conditions, such as trying to read a value that's larger than
1200  * the value currently known as the next to assign, raise an error.  Previously
1201  * these also returned -1, but since this can lead to the wrong visibility
1202  * results, it is dangerous to do that.
1203  *
1204  * onlyLock must be set to true if caller is certain that the given multi
1205  * is used only to lock tuples; can be false without loss of correctness,
1206  * but passing a true means we can return quickly without checking for
1207  * old updates.
1208  */
1209 int
1210 GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1211                                           bool allow_old, bool onlyLock)
1212 {
1213         int                     pageno;
1214         int                     prev_pageno;
1215         int                     entryno;
1216         int                     slotno;
1217         MultiXactOffset *offptr;
1218         MultiXactOffset offset;
1219         int                     length;
1220         int                     truelength;
1221         int                     i;
1222         MultiXactId oldestMXact;
1223         MultiXactId nextMXact;
1224         MultiXactId tmpMXact;
1225         MultiXactOffset nextOffset;
1226         MultiXactMember *ptr;
1227
1228         debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1229
1230         if (!MultiXactIdIsValid(multi))
1231                 return -1;
1232
1233         /* See if the MultiXactId is in the local cache */
1234         length = mXactCacheGetById(multi, members);
1235         if (length >= 0)
1236         {
1237                 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1238                                         mxid_to_string(multi, length, *members));
1239                 return length;
1240         }
1241
1242         /* Set our OldestVisibleMXactId[] entry if we didn't already */
1243         MultiXactIdSetOldestVisible();
1244
1245         /*
1246          * If we know the multi is used only for locking and not for updates, then
1247          * we can skip checking if the value is older than our oldest visible
1248          * multi.  It cannot possibly still be running.
1249          */
1250         if (onlyLock &&
1251                 MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
1252         {
1253                 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1254                 *members = NULL;
1255                 return -1;
1256         }
1257
1258         /*
1259          * We check known limits on MultiXact before resorting to the SLRU area.
1260          *
1261          * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1262          * useful; it has already been removed, or will be removed shortly, by
1263          * truncation.  Returning the wrong values could lead to an incorrect
1264          * visibility result.  However, to support pg_upgrade we need to allow an
1265          * empty set to be returned regardless, if the caller is willing to accept
1266          * it; the caller is expected to check that it's an allowed condition
1267          * (such as ensuring that the infomask bits set on the tuple are
1268          * consistent with the pg_upgrade scenario).  If the caller is expecting
1269          * this to be called only on recently created multis, then we raise an
1270          * error.
1271          *
1272          * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
1273          * seen, it implies undetected ID wraparound has occurred.  This raises a
1274          * hard error.
1275          *
1276          * Shared lock is enough here since we aren't modifying any global state.
1277          * Acquire it just long enough to grab the current counter values.  We may
1278          * need both nextMXact and nextOffset; see below.
1279          */
1280         LWLockAcquire(MultiXactGenLock, LW_SHARED);
1281
1282         oldestMXact = MultiXactState->oldestMultiXactId;
1283         nextMXact = MultiXactState->nextMXact;
1284         nextOffset = MultiXactState->nextOffset;
1285
1286         LWLockRelease(MultiXactGenLock);
1287
1288         if (MultiXactIdPrecedes(multi, oldestMXact))
1289         {
1290                 ereport(allow_old ? DEBUG1 : ERROR,
1291                                 (errcode(ERRCODE_INTERNAL_ERROR),
1292                  errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1293                                 multi)));
1294                 return -1;
1295         }
1296
1297         if (!MultiXactIdPrecedes(multi, nextMXact))
1298                 ereport(ERROR,
1299                                 (errcode(ERRCODE_INTERNAL_ERROR),
1300                                  errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1301                                                 multi)));
1302
1303         /*
1304          * Find out the offset at which we need to start reading MultiXactMembers
1305          * and the number of members in the multixact.  We determine the latter as
1306          * the difference between this multixact's starting offset and the next
1307          * one's.  However, there are some corner cases to worry about:
1308          *
1309          * 1. This multixact may be the latest one created, in which case there is
1310          * no next one to look at.  In this case the nextOffset value we just
1311          * saved is the correct endpoint.
1312          *
1313          * 2. The next multixact may still be in process of being filled in: that
1314          * is, another process may have done GetNewMultiXactId but not yet written
1315          * the offset entry for that ID.  In that scenario, it is guaranteed that
1316          * the offset entry for that multixact exists (because GetNewMultiXactId
1317          * won't release MultiXactGenLock until it does) but contains zero
1318          * (because we are careful to pre-zero offset pages). Because
1319          * GetNewMultiXactId will never return zero as the starting offset for a
1320          * multixact, when we read zero as the next multixact's offset, we know we
1321          * have this case.  We sleep for a bit and try again.
1322          *
1323          * 3. Because GetNewMultiXactId increments offset zero to offset one to
1324          * handle case #2, there is an ambiguity near the point of offset
1325          * wraparound.  If we see next multixact's offset is one, is that our
1326          * multixact's actual endpoint, or did it end at zero with a subsequent
1327          * increment?  We handle this using the knowledge that if the zero'th
1328          * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1329          * transaction ID so it can't be a multixact member.  Therefore, if we
1330          * read a zero from the members array, just ignore it.
1331          *
1332          * This is all pretty messy, but the mess occurs only in infrequent corner
1333          * cases, so it seems better than holding the MultiXactGenLock for a long
1334          * time on every multixact creation.
1335          */
1336 retry:
1337         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1338
1339         pageno = MultiXactIdToOffsetPage(multi);
1340         entryno = MultiXactIdToOffsetEntry(multi);
1341
1342         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1343         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1344         offptr += entryno;
1345         offset = *offptr;
1346
1347         Assert(offset != 0);
1348
1349         /*
1350          * Use the same increment rule as GetNewMultiXactId(), that is, don't
1351          * handle wraparound explicitly until needed.
1352          */
1353         tmpMXact = multi + 1;
1354
1355         if (nextMXact == tmpMXact)
1356         {
1357                 /* Corner case 1: there is no next multixact */
1358                 length = nextOffset - offset;
1359         }
1360         else
1361         {
1362                 MultiXactOffset nextMXOffset;
1363
1364                 /* handle wraparound if needed */
1365                 if (tmpMXact < FirstMultiXactId)
1366                         tmpMXact = FirstMultiXactId;
1367
1368                 prev_pageno = pageno;
1369
1370                 pageno = MultiXactIdToOffsetPage(tmpMXact);
1371                 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1372
1373                 if (pageno != prev_pageno)
1374                         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1375
1376                 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1377                 offptr += entryno;
1378                 nextMXOffset = *offptr;
1379
1380                 if (nextMXOffset == 0)
1381                 {
1382                         /* Corner case 2: next multixact is still being filled in */
1383                         LWLockRelease(MultiXactOffsetControlLock);
1384                         CHECK_FOR_INTERRUPTS();
1385                         pg_usleep(1000L);
1386                         goto retry;
1387                 }
1388
1389                 length = nextMXOffset - offset;
1390         }
1391
1392         LWLockRelease(MultiXactOffsetControlLock);
1393
1394         ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1395         *members = ptr;
1396
1397         /* Now get the members themselves. */
1398         LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
1399
1400         truelength = 0;
1401         prev_pageno = -1;
1402         for (i = 0; i < length; i++, offset++)
1403         {
1404                 TransactionId *xactptr;
1405                 uint32     *flagsptr;
1406                 int                     flagsoff;
1407                 int                     bshift;
1408                 int                     memberoff;
1409
1410                 pageno = MXOffsetToMemberPage(offset);
1411                 memberoff = MXOffsetToMemberOffset(offset);
1412
1413                 if (pageno != prev_pageno)
1414                 {
1415                         slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1416                         prev_pageno = pageno;
1417                 }
1418
1419                 xactptr = (TransactionId *)
1420                         (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1421
1422                 if (!TransactionIdIsValid(*xactptr))
1423                 {
1424                         /* Corner case 3: we must be looking at unused slot zero */
1425                         Assert(offset == 0);
1426                         continue;
1427                 }
1428
1429                 flagsoff = MXOffsetToFlagsOffset(offset);
1430                 bshift = MXOffsetToFlagsBitShift(offset);
1431                 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1432
1433                 ptr[truelength].xid = *xactptr;
1434                 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1435                 truelength++;
1436         }
1437
1438         LWLockRelease(MultiXactMemberControlLock);
1439
1440         /*
1441          * Copy the result into the local cache.
1442          */
1443         mXactCachePut(multi, truelength, ptr);
1444
1445         debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1446                                 mxid_to_string(multi, truelength, ptr));
1447         return truelength;
1448 }
1449
1450 /*
1451  * mxactMemberComparator
1452  *              qsort comparison function for MultiXactMember
1453  *
1454  * We can't use wraparound comparison for XIDs because that does not respect
1455  * the triangle inequality!  Any old sort order will do.
1456  */
1457 static int
1458 mxactMemberComparator(const void *arg1, const void *arg2)
1459 {
1460         MultiXactMember member1 = *(const MultiXactMember *) arg1;
1461         MultiXactMember member2 = *(const MultiXactMember *) arg2;
1462
1463         if (member1.xid > member2.xid)
1464                 return 1;
1465         if (member1.xid < member2.xid)
1466                 return -1;
1467         if (member1.status > member2.status)
1468                 return 1;
1469         if (member1.status < member2.status)
1470                 return -1;
1471         return 0;
1472 }
1473
1474 /*
1475  * mXactCacheGetBySet
1476  *              returns a MultiXactId from the cache based on the set of
1477  *              TransactionIds that compose it, or InvalidMultiXactId if
1478  *              none matches.
1479  *
1480  * This is helpful, for example, if two transactions want to lock a huge
1481  * table.  By using the cache, the second will use the same MultiXactId
1482  * for the majority of tuples, thus keeping MultiXactId usage low (saving
1483  * both I/O and wraparound issues).
1484  *
1485  * NB: the passed members array will be sorted in-place.
1486  */
1487 static MultiXactId
1488 mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1489 {
1490         dlist_iter      iter;
1491
1492         debug_elog3(DEBUG2, "CacheGet: looking for %s",
1493                                 mxid_to_string(InvalidMultiXactId, nmembers, members));
1494
1495         /* sort the array so comparison is easy */
1496         qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1497
1498         dlist_foreach(iter, &MXactCache)
1499         {
1500                 mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
1501
1502                 if (entry->nmembers != nmembers)
1503                         continue;
1504
1505                 /*
1506                  * We assume the cache entries are sorted, and that the unused bits in
1507                  * "status" are zeroed.
1508                  */
1509                 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1510                 {
1511                         debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1512                         dlist_move_head(&MXactCache, iter.cur);
1513                         return entry->multi;
1514                 }
1515         }
1516
1517         debug_elog2(DEBUG2, "CacheGet: not found :-(");
1518         return InvalidMultiXactId;
1519 }
1520
1521 /*
1522  * mXactCacheGetById
1523  *              returns the composing MultiXactMember set from the cache for a
1524  *              given MultiXactId, if present.
1525  *
1526  * If successful, *xids is set to the address of a palloc'd copy of the
1527  * MultiXactMember set.  Return value is number of members, or -1 on failure.
1528  */
1529 static int
1530 mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1531 {
1532         dlist_iter      iter;
1533
1534         debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1535
1536         dlist_foreach(iter, &MXactCache)
1537         {
1538                 mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
1539
1540                 if (entry->multi == multi)
1541                 {
1542                         MultiXactMember *ptr;
1543                         Size            size;
1544
1545                         size = sizeof(MultiXactMember) * entry->nmembers;
1546                         ptr = (MultiXactMember *) palloc(size);
1547                         *members = ptr;
1548
1549                         memcpy(ptr, entry->members, size);
1550
1551                         debug_elog3(DEBUG2, "CacheGet: found %s",
1552                                                 mxid_to_string(multi,
1553                                                                            entry->nmembers,
1554                                                                            entry->members));
1555
1556                         /*
1557                          * Note we modify the list while not using a modifiable iterator.
1558                          * This is acceptable only because we exit the iteration
1559                          * immediately afterwards.
1560                          */
1561                         dlist_move_head(&MXactCache, iter.cur);
1562
1563                         return entry->nmembers;
1564                 }
1565         }
1566
1567         debug_elog2(DEBUG2, "CacheGet: not found");
1568         return -1;
1569 }
1570
1571 /*
1572  * mXactCachePut
1573  *              Add a new MultiXactId and its composing set into the local cache.
1574  */
1575 static void
1576 mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1577 {
1578         mXactCacheEnt *entry;
1579
1580         debug_elog3(DEBUG2, "CachePut: storing %s",
1581                                 mxid_to_string(multi, nmembers, members));
1582
1583         if (MXactContext == NULL)
1584         {
1585                 /* The cache only lives as long as the current transaction */
1586                 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1587                 MXactContext = AllocSetContextCreate(TopTransactionContext,
1588                                                                                          "MultiXact Cache Context",
1589                                                                                          ALLOCSET_SMALL_MINSIZE,
1590                                                                                          ALLOCSET_SMALL_INITSIZE,
1591                                                                                          ALLOCSET_SMALL_MAXSIZE);
1592         }
1593
1594         entry = (mXactCacheEnt *)
1595                 MemoryContextAlloc(MXactContext,
1596                                                    offsetof(mXactCacheEnt, members) +
1597                                                    nmembers * sizeof(MultiXactMember));
1598
1599         entry->multi = multi;
1600         entry->nmembers = nmembers;
1601         memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1602
1603         /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1604         qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1605
1606         dlist_push_head(&MXactCache, &entry->node);
1607         if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES)
1608         {
1609                 dlist_node *node;
1610                 mXactCacheEnt *entry;
1611
1612                 node = dlist_tail_node(&MXactCache);
1613                 dlist_delete(node);
1614                 MXactCacheMembers--;
1615
1616                 entry = dlist_container(mXactCacheEnt, node, node);
1617                 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1618                                         entry->multi);
1619
1620                 pfree(entry);
1621         }
1622 }
1623
1624 static char *
1625 mxstatus_to_string(MultiXactStatus status)
1626 {
1627         switch (status)
1628         {
1629                 case MultiXactStatusForKeyShare:
1630                         return "keysh";
1631                 case MultiXactStatusForShare:
1632                         return "sh";
1633                 case MultiXactStatusForNoKeyUpdate:
1634                         return "fornokeyupd";
1635                 case MultiXactStatusForUpdate:
1636                         return "forupd";
1637                 case MultiXactStatusNoKeyUpdate:
1638                         return "nokeyupd";
1639                 case MultiXactStatusUpdate:
1640                         return "upd";
1641                 default:
1642                         elog(ERROR, "unrecognized multixact status %d", status);
1643                         return "";
1644         }
1645 }
1646
1647 char *
1648 mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1649 {
1650         static char *str = NULL;
1651         StringInfoData buf;
1652         int                     i;
1653
1654         if (str != NULL)
1655                 pfree(str);
1656
1657         initStringInfo(&buf);
1658
1659         appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1660                                          mxstatus_to_string(members[0].status));
1661
1662         for (i = 1; i < nmembers; i++)
1663                 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1664                                                  mxstatus_to_string(members[i].status));
1665
1666         appendStringInfoChar(&buf, ']');
1667         str = MemoryContextStrdup(TopMemoryContext, buf.data);
1668         pfree(buf.data);
1669         return str;
1670 }
1671
1672 /*
1673  * AtEOXact_MultiXact
1674  *              Handle transaction end for MultiXact
1675  *
1676  * This is called at top transaction commit or abort (we don't care which).
1677  */
1678 void
1679 AtEOXact_MultiXact(void)
1680 {
1681         /*
1682          * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1683          * which should only be valid while within a transaction.
1684          *
1685          * We assume that storing a MultiXactId is atomic and so we need not take
1686          * MultiXactGenLock to do this.
1687          */
1688         OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
1689         OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
1690
1691         /*
1692          * Discard the local MultiXactId cache.  Since MXactContext was created as
1693          * a child of TopTransactionContext, we needn't delete it explicitly.
1694          */
1695         MXactContext = NULL;
1696         dlist_init(&MXactCache);
1697         MXactCacheMembers = 0;
1698 }
1699
1700 /*
1701  * AtPrepare_MultiXact
1702  *              Save multixact state at 2PC transaction prepare
1703  *
1704  * In this phase, we only store our OldestMemberMXactId value in the two-phase
1705  * state file.
1706  */
1707 void
1708 AtPrepare_MultiXact(void)
1709 {
1710         MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId];
1711
1712         if (MultiXactIdIsValid(myOldestMember))
1713                 RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1714                                                            &myOldestMember, sizeof(MultiXactId));
1715 }
1716
1717 /*
1718  * PostPrepare_MultiXact
1719  *              Clean up after successful PREPARE TRANSACTION
1720  */
1721 void
1722 PostPrepare_MultiXact(TransactionId xid)
1723 {
1724         MultiXactId myOldestMember;
1725
1726         /*
1727          * Transfer our OldestMemberMXactId value to the slot reserved for the
1728          * prepared transaction.
1729          */
1730         myOldestMember = OldestMemberMXactId[MyBackendId];
1731         if (MultiXactIdIsValid(myOldestMember))
1732         {
1733                 BackendId       dummyBackendId = TwoPhaseGetDummyBackendId(xid);
1734
1735                 /*
1736                  * Even though storing MultiXactId is atomic, acquire lock to make
1737                  * sure others see both changes, not just the reset of the slot of the
1738                  * current backend. Using a volatile pointer might suffice, but this
1739                  * isn't a hot spot.
1740                  */
1741                 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1742
1743                 OldestMemberMXactId[dummyBackendId] = myOldestMember;
1744                 OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
1745
1746                 LWLockRelease(MultiXactGenLock);
1747         }
1748
1749         /*
1750          * We don't need to transfer OldestVisibleMXactId value, because the
1751          * transaction is not going to be looking at any more multixacts once it's
1752          * prepared.
1753          *
1754          * We assume that storing a MultiXactId is atomic and so we need not take
1755          * MultiXactGenLock to do this.
1756          */
1757         OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
1758
1759         /*
1760          * Discard the local MultiXactId cache like in AtEOX_MultiXact
1761          */
1762         MXactContext = NULL;
1763         dlist_init(&MXactCache);
1764         MXactCacheMembers = 0;
1765 }
1766
1767 /*
1768  * multixact_twophase_recover
1769  *              Recover the state of a prepared transaction at startup
1770  */
1771 void
1772 multixact_twophase_recover(TransactionId xid, uint16 info,
1773                                                    void *recdata, uint32 len)
1774 {
1775         BackendId       dummyBackendId = TwoPhaseGetDummyBackendId(xid);
1776         MultiXactId oldestMember;
1777
1778         /*
1779          * Get the oldest member XID from the state file record, and set it in the
1780          * OldestMemberMXactId slot reserved for this prepared transaction.
1781          */
1782         Assert(len == sizeof(MultiXactId));
1783         oldestMember = *((MultiXactId *) recdata);
1784
1785         OldestMemberMXactId[dummyBackendId] = oldestMember;
1786 }
1787
1788 /*
1789  * multixact_twophase_postcommit
1790  *              Similar to AtEOX_MultiXact but for COMMIT PREPARED
1791  */
1792 void
1793 multixact_twophase_postcommit(TransactionId xid, uint16 info,
1794                                                           void *recdata, uint32 len)
1795 {
1796         BackendId       dummyBackendId = TwoPhaseGetDummyBackendId(xid);
1797
1798         Assert(len == sizeof(MultiXactId));
1799
1800         OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId;
1801 }
1802
1803 /*
1804  * multixact_twophase_postabort
1805  *              This is actually just the same as the COMMIT case.
1806  */
1807 void
1808 multixact_twophase_postabort(TransactionId xid, uint16 info,
1809                                                          void *recdata, uint32 len)
1810 {
1811         multixact_twophase_postcommit(xid, info, recdata, len);
1812 }
1813
1814 /*
1815  * Initialization of shared memory for MultiXact.  We use two SLRU areas,
1816  * thus double memory.  Also, reserve space for the shared MultiXactState
1817  * struct and the per-backend MultiXactId arrays (two of those, too).
1818  */
1819 Size
1820 MultiXactShmemSize(void)
1821 {
1822         Size            size;
1823
1824         /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */
1825 #define SHARED_MULTIXACT_STATE_SIZE \
1826         add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \
1827                          mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1828
1829         size = SHARED_MULTIXACT_STATE_SIZE;
1830         size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0));
1831         size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0));
1832
1833         return size;
1834 }
1835
1836 void
1837 MultiXactShmemInit(void)
1838 {
1839         bool            found;
1840
1841         debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1842
1843         MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
1844         MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
1845
1846         SimpleLruInit(MultiXactOffsetCtl,
1847                                   "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS, 0,
1848                                   MultiXactOffsetControlLock, "pg_multixact/offsets");
1849         SimpleLruInit(MultiXactMemberCtl,
1850                                   "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS, 0,
1851                                   MultiXactMemberControlLock, "pg_multixact/members");
1852
1853         /* Initialize our shared state struct */
1854         MultiXactState = ShmemInitStruct("Shared MultiXact State",
1855                                                                          SHARED_MULTIXACT_STATE_SIZE,
1856                                                                          &found);
1857         if (!IsUnderPostmaster)
1858         {
1859                 Assert(!found);
1860
1861                 /* Make sure we zero out the per-backend state */
1862                 MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1863         }
1864         else
1865                 Assert(found);
1866
1867         /*
1868          * Set up array pointers.  Note that perBackendXactIds[0] is wasted space
1869          * since we only use indexes 1..MaxOldestSlot in each array.
1870          */
1871         OldestMemberMXactId = MultiXactState->perBackendXactIds;
1872         OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
1873 }
1874
1875 /*
1876  * This func must be called ONCE on system install.  It creates the initial
1877  * MultiXact segments.  (The MultiXacts directories are assumed to have been
1878  * created by initdb, and MultiXactShmemInit must have been called already.)
1879  */
1880 void
1881 BootStrapMultiXact(void)
1882 {
1883         int                     slotno;
1884
1885         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1886
1887         /* Create and zero the first page of the offsets log */
1888         slotno = ZeroMultiXactOffsetPage(0, false);
1889
1890         /* Make sure it's written out */
1891         SimpleLruWritePage(MultiXactOffsetCtl, slotno);
1892         Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
1893
1894         LWLockRelease(MultiXactOffsetControlLock);
1895
1896         LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
1897
1898         /* Create and zero the first page of the members log */
1899         slotno = ZeroMultiXactMemberPage(0, false);
1900
1901         /* Make sure it's written out */
1902         SimpleLruWritePage(MultiXactMemberCtl, slotno);
1903         Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
1904
1905         LWLockRelease(MultiXactMemberControlLock);
1906 }
1907
1908 /*
1909  * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
1910  * If writeXlog is TRUE, also emit an XLOG record saying we did this.
1911  *
1912  * The page is not actually written, just set up in shared memory.
1913  * The slot number of the new page is returned.
1914  *
1915  * Control lock must be held at entry, and will be held at exit.
1916  */
1917 static int
1918 ZeroMultiXactOffsetPage(int pageno, bool writeXlog)
1919 {
1920         int                     slotno;
1921
1922         slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
1923
1924         if (writeXlog)
1925                 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
1926
1927         return slotno;
1928 }
1929
1930 /*
1931  * Ditto, for MultiXactMember
1932  */
1933 static int
1934 ZeroMultiXactMemberPage(int pageno, bool writeXlog)
1935 {
1936         int                     slotno;
1937
1938         slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
1939
1940         if (writeXlog)
1941                 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
1942
1943         return slotno;
1944 }
1945
1946 /*
1947  * MaybeExtendOffsetSlru
1948  *              Extend the offsets SLRU area, if necessary
1949  *
1950  * After a binary upgrade from <= 9.2, the pg_multixact/offset SLRU area might
1951  * contain files that are shorter than necessary; this would occur if the old
1952  * installation had used multixacts beyond the first page (files cannot be
1953  * copied, because the on-disk representation is different).  pg_upgrade would
1954  * update pg_control to set the next offset value to be at that position, so
1955  * that tuples marked as locked by such MultiXacts would be seen as visible
1956  * without having to consult multixact.  However, trying to create and use a
1957  * new MultiXactId would result in an error because the page on which the new
1958  * value would reside does not exist.  This routine is in charge of creating
1959  * such pages.
1960  */
1961 static void
1962 MaybeExtendOffsetSlru(void)
1963 {
1964         int                     pageno;
1965
1966         pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
1967
1968         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1969
1970         if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
1971         {
1972                 int                     slotno;
1973
1974                 /*
1975                  * Fortunately for us, SimpleLruWritePage is already prepared to deal
1976                  * with creating a new segment file even if the page we're writing is
1977                  * not the first in it, so this is enough.
1978                  */
1979                 slotno = ZeroMultiXactOffsetPage(pageno, false);
1980                 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
1981         }
1982
1983         LWLockRelease(MultiXactOffsetControlLock);
1984 }
1985
1986 /*
1987  * This must be called ONCE during postmaster or standalone-backend startup.
1988  *
1989  * StartupXLOG has already established nextMXact/nextOffset by calling
1990  * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
1991  * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
1992  * replayed WAL.
1993  */
1994 void
1995 StartupMultiXact(void)
1996 {
1997         MultiXactId multi = MultiXactState->nextMXact;
1998         MultiXactOffset offset = MultiXactState->nextOffset;
1999         int                     pageno;
2000
2001         /*
2002          * Initialize offset's idea of the latest page number.
2003          */
2004         pageno = MultiXactIdToOffsetPage(multi);
2005         MultiXactOffsetCtl->shared->latest_page_number = pageno;
2006
2007         /*
2008          * Initialize member's idea of the latest page number.
2009          */
2010         pageno = MXOffsetToMemberPage(offset);
2011         MultiXactMemberCtl->shared->latest_page_number = pageno;
2012 }
2013
2014 /*
2015  * This must be called ONCE at the end of startup/recovery.
2016  */
2017 void
2018 TrimMultiXact(void)
2019 {
2020         MultiXactId nextMXact;
2021         MultiXactOffset offset;
2022         MultiXactId oldestMXact;
2023         Oid                     oldestMXactDB;
2024         int                     pageno;
2025         int                     entryno;
2026         int                     flagsoff;
2027
2028         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2029         nextMXact = MultiXactState->nextMXact;
2030         offset = MultiXactState->nextOffset;
2031         oldestMXact = MultiXactState->oldestMultiXactId;
2032         oldestMXactDB = MultiXactState->oldestMultiXactDB;
2033         LWLockRelease(MultiXactGenLock);
2034
2035         /* Clean up offsets state */
2036         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
2037
2038         /*
2039          * (Re-)Initialize our idea of the latest page number for offsets.
2040          */
2041         pageno = MultiXactIdToOffsetPage(nextMXact);
2042         MultiXactOffsetCtl->shared->latest_page_number = pageno;
2043
2044         /*
2045          * Zero out the remainder of the current offsets page.  See notes in
2046          * TrimCLOG() for motivation.
2047          */
2048         entryno = MultiXactIdToOffsetEntry(nextMXact);
2049         if (entryno != 0)
2050         {
2051                 int                     slotno;
2052                 MultiXactOffset *offptr;
2053
2054                 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2055                 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2056                 offptr += entryno;
2057
2058                 MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2059
2060                 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2061         }
2062
2063         LWLockRelease(MultiXactOffsetControlLock);
2064
2065         /* And the same for members */
2066         LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
2067
2068         /*
2069          * (Re-)Initialize our idea of the latest page number for members.
2070          */
2071         pageno = MXOffsetToMemberPage(offset);
2072         MultiXactMemberCtl->shared->latest_page_number = pageno;
2073
2074         /*
2075          * Zero out the remainder of the current members page.  See notes in
2076          * TrimCLOG() for motivation.
2077          */
2078         flagsoff = MXOffsetToFlagsOffset(offset);
2079         if (flagsoff != 0)
2080         {
2081                 int                     slotno;
2082                 TransactionId *xidptr;
2083                 int                     memberoff;
2084
2085                 memberoff = MXOffsetToMemberOffset(offset);
2086                 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2087                 xidptr = (TransactionId *)
2088                         (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2089
2090                 MemSet(xidptr, 0, BLCKSZ - memberoff);
2091
2092                 /*
2093                  * Note: we don't need to zero out the flag bits in the remaining
2094                  * members of the current group, because they are always reset before
2095                  * writing.
2096                  */
2097
2098                 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2099         }
2100
2101         LWLockRelease(MultiXactMemberControlLock);
2102
2103         /* signal that we're officially up */
2104         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2105         MultiXactState->finishedStartup = true;
2106         LWLockRelease(MultiXactGenLock);
2107
2108         /* Now compute how far away the next members wraparound is. */
2109         SetMultiXactIdLimit(oldestMXact, oldestMXactDB);
2110 }
2111
2112 /*
2113  * This must be called ONCE during postmaster or standalone-backend shutdown
2114  */
2115 void
2116 ShutdownMultiXact(void)
2117 {
2118         /* Flush dirty MultiXact pages to disk */
2119         TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
2120         SimpleLruFlush(MultiXactOffsetCtl, false);
2121         SimpleLruFlush(MultiXactMemberCtl, false);
2122         TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
2123 }
2124
2125 /*
2126  * Get the MultiXact data to save in a checkpoint record
2127  */
2128 void
2129 MultiXactGetCheckptMulti(bool is_shutdown,
2130                                                  MultiXactId *nextMulti,
2131                                                  MultiXactOffset *nextMultiOffset,
2132                                                  MultiXactId *oldestMulti,
2133                                                  Oid *oldestMultiDB)
2134 {
2135         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2136         *nextMulti = MultiXactState->nextMXact;
2137         *nextMultiOffset = MultiXactState->nextOffset;
2138         *oldestMulti = MultiXactState->oldestMultiXactId;
2139         *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2140         LWLockRelease(MultiXactGenLock);
2141
2142         debug_elog6(DEBUG2,
2143                                 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2144                                 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2145 }
2146
2147 /*
2148  * Perform a checkpoint --- either during shutdown, or on-the-fly
2149  */
2150 void
2151 CheckPointMultiXact(void)
2152 {
2153         TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2154
2155         /* Flush dirty MultiXact pages to disk */
2156         SimpleLruFlush(MultiXactOffsetCtl, true);
2157         SimpleLruFlush(MultiXactMemberCtl, true);
2158
2159         TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2160 }
2161
2162 /*
2163  * Set the next-to-be-assigned MultiXactId and offset
2164  *
2165  * This is used when we can determine the correct next ID/offset exactly
2166  * from a checkpoint record.  Although this is only called during bootstrap
2167  * and XLog replay, we take the lock in case any hot-standby backends are
2168  * examining the values.
2169  */
2170 void
2171 MultiXactSetNextMXact(MultiXactId nextMulti,
2172                                           MultiXactOffset nextMultiOffset)
2173 {
2174         debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2175                                 nextMulti, nextMultiOffset);
2176         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2177         MultiXactState->nextMXact = nextMulti;
2178         MultiXactState->nextOffset = nextMultiOffset;
2179         LWLockRelease(MultiXactGenLock);
2180
2181         /*
2182          * During a binary upgrade, make sure that the offsets SLRU is large
2183          * enough to contain the next value that would be created.
2184          *
2185          * We need to do this pretty early during the first startup in binary
2186          * upgrade mode: before StartupMultiXact() in fact, because this routine
2187          * is called even before that by StartupXLOG().  And we can't do it
2188          * earlier than at this point, because during that first call of this
2189          * routine we determine the MultiXactState->nextMXact value that
2190          * MaybeExtendOffsetSlru needs.
2191          */
2192         if (IsBinaryUpgrade)
2193                 MaybeExtendOffsetSlru();
2194 }
2195
2196 /*
2197  * Determine the last safe MultiXactId to allocate given the currently oldest
2198  * datminmxid (ie, the oldest MultiXactId that might exist in any database
2199  * of our cluster), and the OID of the (or a) database with that value.
2200  */
2201 void
2202 SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
2203 {
2204         MultiXactId multiVacLimit;
2205         MultiXactId multiWarnLimit;
2206         MultiXactId multiStopLimit;
2207         MultiXactId multiWrapLimit;
2208         MultiXactId curMulti;
2209         bool            needs_offset_vacuum;
2210
2211         Assert(MultiXactIdIsValid(oldest_datminmxid));
2212
2213         /*
2214          * We pretend that a wrap will happen halfway through the multixact ID
2215          * space, but that's not really true, because multixacts wrap differently
2216          * from transaction IDs.  Note that, separately from any concern about
2217          * multixact IDs wrapping, we must ensure that multixact members do not
2218          * wrap.  Limits for that are set in DetermineSafeOldestOffset, not here.
2219          */
2220         multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2221         if (multiWrapLimit < FirstMultiXactId)
2222                 multiWrapLimit += FirstMultiXactId;
2223
2224         /*
2225          * We'll refuse to continue assigning MultiXactIds once we get within 100
2226          * multi of data loss.
2227          *
2228          * Note: This differs from the magic number used in
2229          * SetTransactionIdLimit() since vacuum itself will never generate new
2230          * multis.  XXX actually it does, if it needs to freeze old multis.
2231          */
2232         multiStopLimit = multiWrapLimit - 100;
2233         if (multiStopLimit < FirstMultiXactId)
2234                 multiStopLimit -= FirstMultiXactId;
2235
2236         /*
2237          * We'll start complaining loudly when we get within 10M multis of the
2238          * stop point.   This is kind of arbitrary, but if you let your gas gauge
2239          * get down to 1% of full, would you be looking for the next gas station?
2240          * We need to be fairly liberal about this number because there are lots
2241          * of scenarios where most transactions are done by automatic clients that
2242          * won't pay attention to warnings. (No, we're not gonna make this
2243          * configurable.  If you know enough to configure it, you know enough to
2244          * not get in this kind of trouble in the first place.)
2245          */
2246         multiWarnLimit = multiStopLimit - 10000000;
2247         if (multiWarnLimit < FirstMultiXactId)
2248                 multiWarnLimit -= FirstMultiXactId;
2249
2250         /*
2251          * We'll start trying to force autovacuums when oldest_datminmxid gets to
2252          * be more than autovacuum_multixact_freeze_max_age mxids old.
2253          *
2254          * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2255          * so that we don't have to worry about dealing with on-the-fly changes in
2256          * its value.  See SetTransactionIdLimit.
2257          */
2258         multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2259         if (multiVacLimit < FirstMultiXactId)
2260                 multiVacLimit += FirstMultiXactId;
2261
2262         /* Grab lock for just long enough to set the new limit values */
2263         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2264         MultiXactState->oldestMultiXactId = oldest_datminmxid;
2265         MultiXactState->oldestMultiXactDB = oldest_datoid;
2266         MultiXactState->multiVacLimit = multiVacLimit;
2267         MultiXactState->multiWarnLimit = multiWarnLimit;
2268         MultiXactState->multiStopLimit = multiStopLimit;
2269         MultiXactState->multiWrapLimit = multiWrapLimit;
2270         curMulti = MultiXactState->nextMXact;
2271         LWLockRelease(MultiXactGenLock);
2272
2273         /* Log the info */
2274         ereport(DEBUG1,
2275          (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u",
2276                          multiWrapLimit, oldest_datoid)));
2277
2278         /*
2279          * Computing the actual limits is only possible once the data directory is
2280          * in a consistent state. There's no need to compute the limits while
2281          * still replaying WAL - no decisions about new multis are made even
2282          * though multixact creations might be replayed. So we'll only do further
2283          * checks after TrimMultiXact() has been called.
2284          */
2285         if (!MultiXactState->finishedStartup)
2286                 return;
2287
2288         Assert(!InRecovery);
2289
2290         /* Set limits for offset vacuum. */
2291         needs_offset_vacuum = SetOffsetVacuumLimit();
2292
2293         /*
2294          * If past the autovacuum force point, immediately signal an autovac
2295          * request.  The reason for this is that autovac only processes one
2296          * database per invocation.  Once it's finished cleaning up the oldest
2297          * database, it'll call here, and we'll signal the postmaster to start
2298          * another iteration immediately if there are still any old databases.
2299          */
2300         if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2301                  needs_offset_vacuum) && IsUnderPostmaster)
2302                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2303
2304         /* Give an immediate warning if past the wrap warn point */
2305         if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2306         {
2307                 char       *oldest_datname;
2308
2309                 /*
2310                  * We can be called when not inside a transaction, for example during
2311                  * StartupXLOG().  In such a case we cannot do database access, so we
2312                  * must just report the oldest DB's OID.
2313                  *
2314                  * Note: it's also possible that get_database_name fails and returns
2315                  * NULL, for example because the database just got dropped.  We'll
2316                  * still warn, even though the warning might now be unnecessary.
2317                  */
2318                 if (IsTransactionState())
2319                         oldest_datname = get_database_name(oldest_datoid);
2320                 else
2321                         oldest_datname = NULL;
2322
2323                 if (oldest_datname)
2324                         ereport(WARNING,
2325                                         (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2326                                                                    "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2327                                                                    multiWrapLimit - curMulti,
2328                                                                    oldest_datname,
2329                                                                    multiWrapLimit - curMulti),
2330                                          errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
2331                                                          "You might also need to commit or roll back old prepared transactions.")));
2332                 else
2333                         ereport(WARNING,
2334                                         (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2335                                                                    "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2336                                                                    multiWrapLimit - curMulti,
2337                                                                    oldest_datoid,
2338                                                                    multiWrapLimit - curMulti),
2339                                          errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
2340                                                          "You might also need to commit or roll back old prepared transactions.")));
2341         }
2342 }
2343
2344 /*
2345  * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2346  * and similarly nextOffset is at least minMultiOffset.
2347  *
2348  * This is used when we can determine minimum safe values from an XLog
2349  * record (either an on-line checkpoint or an mxact creation log entry).
2350  * Although this is only called during XLog replay, we take the lock in case
2351  * any hot-standby backends are examining the values.
2352  */
2353 void
2354 MultiXactAdvanceNextMXact(MultiXactId minMulti,
2355                                                   MultiXactOffset minMultiOffset)
2356 {
2357         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2358         if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2359         {
2360                 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2361                 MultiXactState->nextMXact = minMulti;
2362         }
2363         if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2364         {
2365                 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2366                                         minMultiOffset);
2367                 MultiXactState->nextOffset = minMultiOffset;
2368         }
2369         LWLockRelease(MultiXactGenLock);
2370 }
2371
2372 /*
2373  * Update our oldestMultiXactId value, but only if it's more recent than what
2374  * we had.
2375  *
2376  * This may only be called during WAL replay.
2377  */
2378 void
2379 MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2380 {
2381         Assert(InRecovery);
2382
2383         if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2384         {
2385                 /*
2386                  * If there has been a truncation on the master, detected by seeing a
2387                  * moving oldestMulti, without a corresponding truncation record, we
2388                  * know that the primary is still running an older version of postgres
2389                  * that doesn't yet log multixact truncations. So perform the
2390                  * truncation ourselves.
2391                  */
2392                 if (!MultiXactState->sawTruncationInCkptCycle)
2393                 {
2394                         ereport(LOG,
2395                                         (errmsg("performing legacy multixact truncation"),
2396                                          errdetail("Legacy truncations are sometimes performed when replaying WAL from an older primary."),
2397                                          errhint("Upgrade the primary, it is susceptible to data corruption.")));
2398                         TruncateMultiXact(oldestMulti, oldestMultiDB, true);
2399                 }
2400
2401                 SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
2402         }
2403
2404         /* only looked at in the startup process, no lock necessary */
2405         MultiXactState->sawTruncationInCkptCycle = false;
2406 }
2407
2408 /*
2409  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2410  *
2411  * NB: this is called while holding MultiXactGenLock.  We want it to be very
2412  * fast most of the time; even when it's not so fast, no actual I/O need
2413  * happen unless we're forced to write out a dirty log or xlog page to make
2414  * room in shared memory.
2415  */
2416 static void
2417 ExtendMultiXactOffset(MultiXactId multi)
2418 {
2419         int                     pageno;
2420
2421         /*
2422          * No work except at first MultiXactId of a page.  But beware: just after
2423          * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2424          */
2425         if (MultiXactIdToOffsetEntry(multi) != 0 &&
2426                 multi != FirstMultiXactId)
2427                 return;
2428
2429         pageno = MultiXactIdToOffsetPage(multi);
2430
2431         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
2432
2433         /* Zero the page and make an XLOG entry about it */
2434         ZeroMultiXactOffsetPage(pageno, true);
2435
2436         LWLockRelease(MultiXactOffsetControlLock);
2437 }
2438
2439 /*
2440  * Make sure that MultiXactMember has room for the members of a newly-
2441  * allocated MultiXactId.
2442  *
2443  * Like the above routine, this is called while holding MultiXactGenLock;
2444  * same comments apply.
2445  */
2446 static void
2447 ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2448 {
2449         /*
2450          * It's possible that the members span more than one page of the members
2451          * file, so we loop to ensure we consider each page.  The coding is not
2452          * optimal if the members span several pages, but that seems unusual
2453          * enough to not worry much about.
2454          */
2455         while (nmembers > 0)
2456         {
2457                 int                     flagsoff;
2458                 int                     flagsbit;
2459                 uint32          difference;
2460
2461                 /*
2462                  * Only zero when at first entry of a page.
2463                  */
2464                 flagsoff = MXOffsetToFlagsOffset(offset);
2465                 flagsbit = MXOffsetToFlagsBitShift(offset);
2466                 if (flagsoff == 0 && flagsbit == 0)
2467                 {
2468                         int                     pageno;
2469
2470                         pageno = MXOffsetToMemberPage(offset);
2471
2472                         LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
2473
2474                         /* Zero the page and make an XLOG entry about it */
2475                         ZeroMultiXactMemberPage(pageno, true);
2476
2477                         LWLockRelease(MultiXactMemberControlLock);
2478                 }
2479
2480                 /*
2481                  * Compute the number of items till end of current page.  Careful: if
2482                  * addition of unsigned ints wraps around, we're at the last page of
2483                  * the last segment; since that page holds a different number of items
2484                  * than other pages, we need to do it differently.
2485                  */
2486                 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2487                 {
2488                         /*
2489                          * This is the last page of the last segment; we can compute the
2490                          * number of items left to allocate in it without modulo
2491                          * arithmetic.
2492                          */
2493                         difference = MaxMultiXactOffset - offset + 1;
2494                 }
2495                 else
2496                         difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2497
2498                 /*
2499                  * Advance to next page, taking care to properly handle the wraparound
2500                  * case.  OK if nmembers goes negative.
2501                  */
2502                 nmembers -= difference;
2503                 offset += difference;
2504         }
2505 }
2506
2507 /*
2508  * GetOldestMultiXactId
2509  *
2510  * Return the oldest MultiXactId that's still possibly still seen as live by
2511  * any running transaction.  Older ones might still exist on disk, but they no
2512  * longer have any running member transaction.
2513  *
2514  * It's not safe to truncate MultiXact SLRU segments on the value returned by
2515  * this function; however, it can be used by a full-table vacuum to set the
2516  * point at which it will be possible to truncate SLRU for that table.
2517  */
2518 MultiXactId
2519 GetOldestMultiXactId(void)
2520 {
2521         MultiXactId oldestMXact;
2522         MultiXactId nextMXact;
2523         int                     i;
2524
2525         /*
2526          * This is the oldest valid value among all the OldestMemberMXactId[] and
2527          * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2528          */
2529         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2530
2531         /*
2532          * We have to beware of the possibility that nextMXact is in the
2533          * wrapped-around state.  We don't fix the counter itself here, but we
2534          * must be sure to use a valid value in our calculation.
2535          */
2536         nextMXact = MultiXactState->nextMXact;
2537         if (nextMXact < FirstMultiXactId)
2538                 nextMXact = FirstMultiXactId;
2539
2540         oldestMXact = nextMXact;
2541         for (i = 1; i <= MaxOldestSlot; i++)
2542         {
2543                 MultiXactId thisoldest;
2544
2545                 thisoldest = OldestMemberMXactId[i];
2546                 if (MultiXactIdIsValid(thisoldest) &&
2547                         MultiXactIdPrecedes(thisoldest, oldestMXact))
2548                         oldestMXact = thisoldest;
2549                 thisoldest = OldestVisibleMXactId[i];
2550                 if (MultiXactIdIsValid(thisoldest) &&
2551                         MultiXactIdPrecedes(thisoldest, oldestMXact))
2552                         oldestMXact = thisoldest;
2553         }
2554
2555         LWLockRelease(MultiXactGenLock);
2556
2557         return oldestMXact;
2558 }
2559
2560 /*
2561  * Determine how aggressively we need to vacuum in order to prevent member
2562  * wraparound.
2563  *
2564  * To do so determine what's the oldest member offset and install the limit
2565  * info in MultiXactState, where it can be used to prevent overrun of old data
2566  * in the members SLRU area.
2567  *
2568  * The return value is true if emergency autovacuum is required and false
2569  * otherwise.
2570  */
2571 static bool
2572 SetOffsetVacuumLimit(void)
2573 {
2574         MultiXactId oldestMultiXactId;
2575         MultiXactId nextMXact;
2576         MultiXactOffset oldestOffset = 0;       /* placate compiler */
2577         MultiXactOffset prevOldestOffset;
2578         MultiXactOffset nextOffset;
2579         bool            oldestOffsetKnown = false;
2580         bool            prevOldestOffsetKnown;
2581         MultiXactOffset offsetStopLimit = 0;
2582
2583         /*
2584          * NB: Have to prevent concurrent truncation, we might otherwise try to
2585          * lookup a oldestMulti that's concurrently getting truncated away.
2586          */
2587         LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2588
2589         /* Read relevant fields from shared memory. */
2590         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2591         oldestMultiXactId = MultiXactState->oldestMultiXactId;
2592         nextMXact = MultiXactState->nextMXact;
2593         nextOffset = MultiXactState->nextOffset;
2594         prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2595         prevOldestOffset = MultiXactState->oldestOffset;
2596         Assert(MultiXactState->finishedStartup);
2597         LWLockRelease(MultiXactGenLock);
2598
2599         /*
2600          * Determine the offset of the oldest multixact.  Normally, we can read
2601          * the offset from the multixact itself, but there's an important special
2602          * case: if there are no multixacts in existence at all, oldestMXact
2603          * obviously can't point to one.  It will instead point to the multixact
2604          * ID that will be assigned the next time one is needed.
2605          */
2606         if (oldestMultiXactId == nextMXact)
2607         {
2608                 /*
2609                  * When the next multixact gets created, it will be stored at the next
2610                  * offset.
2611                  */
2612                 oldestOffset = nextOffset;
2613                 oldestOffsetKnown = true;
2614         }
2615         else
2616         {
2617                 /*
2618                  * Figure out where the oldest existing multixact's offsets are
2619                  * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2620                  * the supposedly-earliest multixact might not really exist.  We are
2621                  * careful not to fail in that case.
2622                  */
2623                 oldestOffsetKnown =
2624                         find_multixact_start(oldestMultiXactId, &oldestOffset);
2625
2626                 if (oldestOffsetKnown)
2627                         ereport(DEBUG1,
2628                                         (errmsg("oldest MultiXactId member is at offset %u",
2629                                                         oldestOffset)));
2630                 else
2631                         ereport(LOG,
2632                                         (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2633                                                         oldestMultiXactId)));
2634         }
2635
2636         LWLockRelease(MultiXactTruncationLock);
2637
2638         /*
2639          * If we can, compute limits (and install them MultiXactState) to prevent
2640          * overrun of old data in the members SLRU area. We can only do so if the
2641          * oldest offset is known though.
2642          */
2643         if (oldestOffsetKnown)
2644         {
2645                 /* move back to start of the corresponding segment */
2646                 offsetStopLimit = oldestOffset - (oldestOffset %
2647                                           (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
2648
2649                 /* always leave one segment before the wraparound point */
2650                 offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
2651
2652                 if (!prevOldestOffsetKnown && IsUnderPostmaster)
2653                         ereport(LOG,
2654                                         (errmsg("MultiXact member wraparound protections are now enabled")));
2655                 ereport(DEBUG1,
2656                 (errmsg("MultiXact member stop limit is now %u based on MultiXact %u",
2657                                 offsetStopLimit, oldestMultiXactId)));
2658         }
2659         else if (prevOldestOffsetKnown)
2660         {
2661                 /*
2662                  * If we failed to get the oldest offset this time, but we have a
2663                  * value from a previous pass through this function, use the old value
2664                  * rather than automatically forcing it.
2665                  */
2666                 oldestOffset = prevOldestOffset;
2667                 oldestOffsetKnown = true;
2668         }
2669
2670         /* Install the computed values */
2671         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2672         MultiXactState->oldestOffset = oldestOffset;
2673         MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2674         MultiXactState->offsetStopLimit = offsetStopLimit;
2675         LWLockRelease(MultiXactGenLock);
2676
2677         /*
2678          * Do we need an emergency autovacuum?  If we're not sure, assume yes.
2679          */
2680         return !oldestOffsetKnown ||
2681                 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2682 }
2683
2684 /*
2685  * Return whether adding "distance" to "start" would move past "boundary".
2686  *
2687  * We use this to determine whether the addition is "wrapping around" the
2688  * boundary point, hence the name.  The reason we don't want to use the regular
2689  * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2690  * the 2^32-1 space here, allowing for more multixacts that would fit
2691  * otherwise.
2692  */
2693 static bool
2694 MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
2695                                                  uint32 distance)
2696 {
2697         MultiXactOffset finish;
2698
2699         /*
2700          * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2701          * if the addition wraps around the UINT_MAX boundary, skip that value.
2702          */
2703         finish = start + distance;
2704         if (finish < start)
2705                 finish++;
2706
2707         /*-----------------------------------------------------------------------
2708          * When the boundary is numerically greater than the starting point, any
2709          * value numerically between the two is not wrapped:
2710          *
2711          *      <----S----B---->
2712          *      [---)                    = F wrapped past B (and UINT_MAX)
2713          *               [---)           = F not wrapped
2714          *                        [----] = F wrapped past B
2715          *
2716          * When the boundary is numerically less than the starting point (i.e. the
2717          * UINT_MAX wraparound occurs somewhere in between) then all values in
2718          * between are wrapped:
2719          *
2720          *      <----B----S---->
2721          *      [---)                    = F not wrapped past B (but wrapped past UINT_MAX)
2722          *               [---)           = F wrapped past B (and UINT_MAX)
2723          *                        [----] = F not wrapped
2724          *-----------------------------------------------------------------------
2725          */
2726         if (start < boundary)
2727                 return finish >= boundary || finish < start;
2728         else
2729                 return finish >= boundary && finish < start;
2730 }
2731
2732 /*
2733  * Find the starting offset of the given MultiXactId.
2734  *
2735  * Returns false if the file containing the multi does not exist on disk.
2736  * Otherwise, returns true and sets *result to the starting member offset.
2737  *
2738  * This function does not prevent concurrent truncation, so if that's
2739  * required, the caller has to protect against that.
2740  */
2741 static bool
2742 find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2743 {
2744         MultiXactOffset offset;
2745         int                     pageno;
2746         int                     entryno;
2747         int                     slotno;
2748         MultiXactOffset *offptr;
2749
2750         /* XXX: Remove || AmStartupProcess() after WAL page magic bump */
2751         Assert(MultiXactState->finishedStartup || AmStartupProcess());
2752
2753         pageno = MultiXactIdToOffsetPage(multi);
2754         entryno = MultiXactIdToOffsetEntry(multi);
2755
2756         /*
2757          * Flush out dirty data, so PhysicalPageExists can work correctly.
2758          * SimpleLruFlush() is a pretty big hammer for that.  Alternatively we
2759          * could add a in-memory version of page exists, but find_multixact_start
2760          * is called infrequently, and it doesn't seem bad to flush buffers to
2761          * disk before truncation.
2762          */
2763         SimpleLruFlush(MultiXactOffsetCtl, true);
2764         SimpleLruFlush(MultiXactMemberCtl, true);
2765
2766         if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2767                 return false;
2768
2769         /* lock is acquired by SimpleLruReadPage_ReadOnly */
2770         slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2771         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2772         offptr += entryno;
2773         offset = *offptr;
2774         LWLockRelease(MultiXactOffsetControlLock);
2775
2776         *result = offset;
2777         return true;
2778 }
2779
2780 /*
2781  * Determine how many multixacts, and how many multixact members, currently
2782  * exist.  Return false if unable to determine.
2783  */
2784 static bool
2785 ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
2786 {
2787         MultiXactOffset nextOffset;
2788         MultiXactOffset oldestOffset;
2789         MultiXactId oldestMultiXactId;
2790         MultiXactId nextMultiXactId;
2791         bool            oldestOffsetKnown;
2792
2793         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2794         nextOffset = MultiXactState->nextOffset;
2795         oldestMultiXactId = MultiXactState->oldestMultiXactId;
2796         nextMultiXactId = MultiXactState->nextMXact;
2797         oldestOffset = MultiXactState->oldestOffset;
2798         oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2799         LWLockRelease(MultiXactGenLock);
2800
2801         if (!oldestOffsetKnown)
2802                 return false;
2803
2804         *members = nextOffset - oldestOffset;
2805         *multixacts = nextMultiXactId - oldestMultiXactId;
2806         return true;
2807 }
2808
2809 /*
2810  * Multixact members can be removed once the multixacts that refer to them
2811  * are older than every datminxmid.  autovacuum_multixact_freeze_max_age and
2812  * vacuum_multixact_freeze_table_age work together to make sure we never have
2813  * too many multixacts; we hope that, at least under normal circumstances,
2814  * this will also be sufficient to keep us from using too many offsets.
2815  * However, if the average multixact has many members, we might exhaust the
2816  * members space while still using few enough members that these limits fail
2817  * to trigger full table scans for relminmxid advancement.  At that point,
2818  * we'd have no choice but to start failing multixact-creating operations
2819  * with an error.
2820  *
2821  * To prevent that, if more than a threshold portion of the members space is
2822  * used, we effectively reduce autovacuum_multixact_freeze_max_age and
2823  * to a value just less than the number of multixacts in use.  We hope that
2824  * this will quickly trigger autovacuuming on the table or tables with the
2825  * oldest relminmxid, thus allowing datminmxid values to advance and removing
2826  * some members.
2827  *
2828  * As the fraction of the member space currently in use grows, we become
2829  * more aggressive in clamping this value.  That not only causes autovacuum
2830  * to ramp up, but also makes any manual vacuums the user issues more
2831  * aggressive.  This happens because vacuum_set_xid_limits() clamps the
2832  * freeze table and and the minimum freeze age based on the effective
2833  * autovacuum_multixact_freeze_max_age this function returns.  In the worst
2834  * case, we'll claim the freeze_max_age to zero, and every vacuum of any
2835  * table will try to freeze every multixact.
2836  *
2837  * It's possible that these thresholds should be user-tunable, but for now
2838  * we keep it simple.
2839  */
2840 int
2841 MultiXactMemberFreezeThreshold(void)
2842 {
2843         MultiXactOffset members;
2844         uint32          multixacts;
2845         uint32          victim_multixacts;
2846         double          fraction;
2847
2848         /* If we can't determine member space utilization, assume the worst. */
2849         if (!ReadMultiXactCounts(&multixacts, &members))
2850                 return 0;
2851
2852         /* If member space utilization is low, no special action is required. */
2853         if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2854                 return autovacuum_multixact_freeze_max_age;
2855
2856         /*
2857          * Compute a target for relminmxid advancement.  The number of multixacts
2858          * we try to eliminate from the system is based on how far we are past
2859          * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2860          */
2861         fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
2862                 (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
2863         victim_multixacts = multixacts * fraction;
2864
2865         /* fraction could be > 1.0, but lowest possible freeze age is zero */
2866         if (victim_multixacts > multixacts)
2867                 return 0;
2868         return multixacts - victim_multixacts;
2869 }
2870
2871 typedef struct mxtruncinfo
2872 {
2873         int                     earliestExistingPage;
2874 } mxtruncinfo;
2875
2876 /*
2877  * SlruScanDirectory callback
2878  *              This callback determines the earliest existing page number.
2879  */
2880 static bool
2881 SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
2882 {
2883         mxtruncinfo *trunc = (mxtruncinfo *) data;
2884
2885         if (trunc->earliestExistingPage == -1 ||
2886                 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2887         {
2888                 trunc->earliestExistingPage = segpage;
2889         }
2890
2891         return false;                           /* keep going */
2892 }
2893
2894
2895 /*
2896  * Delete members segments [oldest, newOldest)
2897  *
2898  * The members SLRU can, in contrast to the offsets one, be filled to almost
2899  * the full range at once. This means SimpleLruTruncate() can't trivially be
2900  * used - instead the to-be-deleted range is computed using the offsets
2901  * SLRU. C.f. TruncateMultiXact().
2902  */
2903 static void
2904 PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
2905 {
2906         const int       maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
2907         int                     startsegment = MXOffsetToMemberSegment(oldestOffset);
2908         int                     endsegment = MXOffsetToMemberSegment(newOldestOffset);
2909         int                     segment = startsegment;
2910
2911         /*
2912          * Delete all the segments but the last one. The last segment can still
2913          * contain, possibly partially, valid data.
2914          */
2915         while (segment != endsegment)
2916         {
2917                 elog(DEBUG2, "truncating multixact members segment %x", segment);
2918                 SlruDeleteSegment(MultiXactMemberCtl, segment);
2919
2920                 /* move to next segment, handling wraparound correctly */
2921                 if (segment == maxsegment)
2922                         segment = 0;
2923                 else
2924                         segment += 1;
2925         }
2926 }
2927
2928 /*
2929  * Delete offsets segments [oldest, newOldest)
2930  */
2931 static void
2932 PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
2933 {
2934         /*
2935          * We step back one multixact to avoid passing a cutoff page that hasn't
2936          * been created yet in the rare case that oldestMulti would be the first
2937          * item on a page and oldestMulti == nextMulti.  In that case, if we
2938          * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
2939          * detection.
2940          */
2941         SimpleLruTruncate(MultiXactOffsetCtl,
2942                            MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
2943 }
2944
2945 /*
2946  * Remove all MultiXactOffset and MultiXactMember segments before the oldest
2947  * ones still of interest.
2948  *
2949  * On a primary this is called as part of vacuum (via
2950  * vac_truncate_clog()). During recovery truncation is normally done by
2951  * replaying truncation WAL records instead of this routine; the exception is
2952  * when replaying records from an older primary that doesn't yet generate
2953  * truncation WAL records. In that case truncation is triggered by
2954  * MultiXactAdvanceOldest().
2955  *
2956  * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
2957  * is one of the databases preventing newOldestMulti from increasing.
2958  */
2959 void
2960 TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB, bool in_recovery)
2961 {
2962         MultiXactId oldestMulti;
2963         MultiXactId nextMulti;
2964         MultiXactOffset newOldestOffset;
2965         MultiXactOffset oldestOffset;
2966         MultiXactOffset nextOffset;
2967         mxtruncinfo trunc;
2968         MultiXactId earliest;
2969
2970         /*
2971          * Need to allow being called in recovery for backwards compatibility,
2972          * when an updated standby replays WAL generated by a non-updated primary.
2973          */
2974         Assert(in_recovery || !RecoveryInProgress());
2975         Assert(!in_recovery || AmStartupProcess());
2976         Assert(in_recovery || MultiXactState->finishedStartup);
2977
2978         /*
2979          * We can only allow one truncation to happen at once. Otherwise parts of
2980          * members might vanish while we're doing lookups or similar. There's no
2981          * need to have an interlock with creating new multis or such, since those
2982          * are constrained by the limits (which only grow, never shrink).
2983          */
2984         LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
2985
2986         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2987         nextMulti = MultiXactState->nextMXact;
2988         nextOffset = MultiXactState->nextOffset;
2989         oldestMulti = MultiXactState->oldestMultiXactId;
2990         LWLockRelease(MultiXactGenLock);
2991         Assert(MultiXactIdIsValid(oldestMulti));
2992
2993         /*
2994          * Make sure to only attempt truncation if there's values to truncate
2995          * away. In normal processing values shouldn't go backwards, but there's
2996          * some corner cases (due to bugs) where that's possible.
2997          */
2998         if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
2999         {
3000                 LWLockRelease(MultiXactTruncationLock);
3001                 return;
3002         }
3003
3004         /*
3005          * Note we can't just plow ahead with the truncation; it's possible that
3006          * there are no segments to truncate, which is a problem because we are
3007          * going to attempt to read the offsets page to determine where to
3008          * truncate the members SLRU.  So we first scan the directory to determine
3009          * the earliest offsets page number that we can read without error.
3010          *
3011          * NB: It's also possible that the page that oldestMulti is on has already
3012          * been truncated away, and we crashed before updating oldestMulti.
3013          */
3014         trunc.earliestExistingPage = -1;
3015         SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
3016         earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
3017         if (earliest < FirstMultiXactId)
3018                 earliest = FirstMultiXactId;
3019
3020         /* If there's nothing to remove, we can bail out early. */
3021         if (MultiXactIdPrecedes(oldestMulti, earliest))
3022         {
3023                 LWLockRelease(MultiXactTruncationLock);
3024                 return;
3025         }
3026
3027         /*
3028          * First, compute the safe truncation point for MultiXactMember. This is
3029          * the starting offset of the oldest multixact.
3030          *
3031          * Hopefully, find_multixact_start will always work here, because we've
3032          * already checked that it doesn't precede the earliest MultiXact on disk.
3033          * But if it fails, don't truncate anything, and log a message.
3034          */
3035         if (oldestMulti == nextMulti)
3036         {
3037                 /* there are NO MultiXacts */
3038                 oldestOffset = nextOffset;
3039         }
3040         else if (!find_multixact_start(oldestMulti, &oldestOffset))
3041         {
3042                 ereport(LOG,
3043                                 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3044                                                 oldestMulti, earliest)));
3045                 LWLockRelease(MultiXactTruncationLock);
3046                 return;
3047         }
3048
3049         /*
3050          * Secondly compute up to where to truncate. Lookup the corresponding
3051          * member offset for newOldestMulti for that.
3052          */
3053         if (newOldestMulti == nextMulti)
3054         {
3055                 /* there are NO MultiXacts */
3056                 newOldestOffset = nextOffset;
3057         }
3058         else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3059         {
3060                 ereport(LOG,
3061                                 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3062                                                 newOldestMulti)));
3063                 LWLockRelease(MultiXactTruncationLock);
3064                 return;
3065         }
3066
3067         elog(DEBUG1, "performing multixact truncation: "
3068                  "offsets [%u, %u), offsets segments [%x, %x), "
3069                  "members [%u, %u), members segments [%x, %x)",
3070                  oldestMulti, newOldestMulti,
3071                  MultiXactIdToOffsetSegment(oldestMulti),
3072                  MultiXactIdToOffsetSegment(newOldestMulti),
3073                  oldestOffset, newOldestOffset,
3074                  MXOffsetToMemberSegment(oldestOffset),
3075                  MXOffsetToMemberSegment(newOldestOffset));
3076
3077         /*
3078          * Do truncation, and the WAL logging of the truncation, in a critical
3079          * section. That way offsets/members cannot get out of sync anymore, i.e.
3080          * once consistent the newOldestMulti will always exist in members, even
3081          * if we crashed in the wrong moment.
3082          */
3083         START_CRIT_SECTION();
3084
3085         /*
3086          * Prevent checkpoints from being scheduled concurrently. This is critical
3087          * because otherwise a truncation record might not be replayed after a
3088          * crash/basebackup, even though the state of the data directory would
3089          * require it.  It's not possible (startup process doesn't have a PGXACT
3090          * entry), and not needed, to do this during recovery, when performing an
3091          * old-style truncation, though. There the entire scheduling depends on
3092          * the replayed WAL records which be the same after a possible crash.
3093          */
3094         if (!in_recovery)
3095         {
3096                 Assert(!MyPgXact->delayChkpt);
3097                 MyPgXact->delayChkpt = true;
3098         }
3099
3100         /* WAL log truncation */
3101         if (!in_recovery)
3102                 WriteMTruncateXlogRec(newOldestMultiDB,
3103                                                           oldestMulti, newOldestMulti,
3104                                                           oldestOffset, newOldestOffset);
3105
3106         /*
3107          * Update in-memory limits before performing the truncation, while inside
3108          * the critical section: Have to do it before truncation, to prevent
3109          * concurrent lookups of those values. Has to be inside the critical
3110          * section as otherwise a future call to this function would error out,
3111          * while looking up the oldest member in offsets, if our caller crashes
3112          * before updating the limits.
3113          */
3114         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3115         MultiXactState->oldestMultiXactId = newOldestMulti;
3116         MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3117         LWLockRelease(MultiXactGenLock);
3118
3119         /* First truncate members */
3120         PerformMembersTruncation(oldestOffset, newOldestOffset);
3121
3122         /* Then offsets */
3123         PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3124
3125         if (!in_recovery)
3126                 MyPgXact->delayChkpt = false;
3127
3128         END_CRIT_SECTION();
3129         LWLockRelease(MultiXactTruncationLock);
3130 }
3131
3132 /*
3133  * Decide which of two MultiXactOffset page numbers is "older" for truncation
3134  * purposes.
3135  *
3136  * We need to use comparison of MultiXactId here in order to do the right
3137  * thing with wraparound.  However, if we are asked about page number zero, we
3138  * don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get
3139  * weird.  So, offset both multis by FirstMultiXactId to avoid that.
3140  * (Actually, the current implementation doesn't do anything weird with
3141  * InvalidMultiXactId, but there's no harm in leaving this code like this.)
3142  */
3143 static bool
3144 MultiXactOffsetPagePrecedes(int page1, int page2)
3145 {
3146         MultiXactId multi1;
3147         MultiXactId multi2;
3148
3149         multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3150         multi1 += FirstMultiXactId;
3151         multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3152         multi2 += FirstMultiXactId;
3153
3154         return MultiXactIdPrecedes(multi1, multi2);
3155 }
3156
3157 /*
3158  * Decide which of two MultiXactMember page numbers is "older" for truncation
3159  * purposes.  There is no "invalid offset number" so use the numbers verbatim.
3160  */
3161 static bool
3162 MultiXactMemberPagePrecedes(int page1, int page2)
3163 {
3164         MultiXactOffset offset1;
3165         MultiXactOffset offset2;
3166
3167         offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3168         offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3169
3170         return MultiXactOffsetPrecedes(offset1, offset2);
3171 }
3172
3173 /*
3174  * Decide which of two MultiXactIds is earlier.
3175  *
3176  * XXX do we need to do something special for InvalidMultiXactId?
3177  * (Doesn't look like it.)
3178  */
3179 bool
3180 MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
3181 {
3182         int32           diff = (int32) (multi1 - multi2);
3183
3184         return (diff < 0);
3185 }
3186
3187 /*
3188  * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3189  *
3190  * XXX do we need to do something special for InvalidMultiXactId?
3191  * (Doesn't look like it.)
3192  */
3193 bool
3194 MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
3195 {
3196         int32           diff = (int32) (multi1 - multi2);
3197
3198         return (diff <= 0);
3199 }
3200
3201
3202 /*
3203  * Decide which of two offsets is earlier.
3204  */
3205 static bool
3206 MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
3207 {
3208         int32           diff = (int32) (offset1 - offset2);
3209
3210         return (diff < 0);
3211 }
3212
3213 /*
3214  * Write an xlog record reflecting the zeroing of either a MEMBERs or
3215  * OFFSETs page (info shows which)
3216  */
3217 static void
3218 WriteMZeroPageXlogRec(int pageno, uint8 info)
3219 {
3220         XLogBeginInsert();
3221         XLogRegisterData((char *) (&pageno), sizeof(int));
3222         (void) XLogInsert(RM_MULTIXACT_ID, info);
3223 }
3224
3225 /*
3226  * Write a TRUNCATE xlog record
3227  *
3228  * We must flush the xlog record to disk before returning --- see notes in
3229  * TruncateCLOG().
3230  */
3231 static void
3232 WriteMTruncateXlogRec(Oid oldestMultiDB,
3233                                           MultiXactId startTruncOff, MultiXactId endTruncOff,
3234                                 MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3235 {
3236         XLogRecPtr      recptr;
3237         xl_multixact_truncate xlrec;
3238
3239         xlrec.oldestMultiDB = oldestMultiDB;
3240
3241         xlrec.startTruncOff = startTruncOff;
3242         xlrec.endTruncOff = endTruncOff;
3243
3244         xlrec.startTruncMemb = startTruncMemb;
3245         xlrec.endTruncMemb = endTruncMemb;
3246
3247         XLogBeginInsert();
3248         XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
3249         recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3250         XLogFlush(recptr);
3251 }
3252
3253 /*
3254  * MULTIXACT resource manager's routines
3255  */
3256 void
3257 multixact_redo(XLogReaderState *record)
3258 {
3259         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3260
3261         /* Backup blocks are not used in multixact records */
3262         Assert(!XLogRecHasAnyBlockRefs(record));
3263
3264         if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3265         {
3266                 int                     pageno;
3267                 int                     slotno;
3268
3269                 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
3270
3271                 LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
3272
3273                 slotno = ZeroMultiXactOffsetPage(pageno, false);
3274                 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3275                 Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3276
3277                 LWLockRelease(MultiXactOffsetControlLock);
3278         }
3279         else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3280         {
3281                 int                     pageno;
3282                 int                     slotno;
3283
3284                 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
3285
3286                 LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
3287
3288                 slotno = ZeroMultiXactMemberPage(pageno, false);
3289                 SimpleLruWritePage(MultiXactMemberCtl, slotno);
3290                 Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
3291
3292                 LWLockRelease(MultiXactMemberControlLock);
3293         }
3294         else if (info == XLOG_MULTIXACT_CREATE_ID)
3295         {
3296                 xl_multixact_create *xlrec =
3297                 (xl_multixact_create *) XLogRecGetData(record);
3298                 TransactionId max_xid;
3299                 int                     i;
3300
3301                 /* Store the data back into the SLRU files */
3302                 RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3303                                                    xlrec->members);
3304
3305                 /* Make sure nextMXact/nextOffset are beyond what this record has */
3306                 MultiXactAdvanceNextMXact(xlrec->mid + 1,
3307                                                                   xlrec->moff + xlrec->nmembers);
3308
3309                 /*
3310                  * Make sure nextXid is beyond any XID mentioned in the record. This
3311                  * should be unnecessary, since any XID found here ought to have other
3312                  * evidence in the XLOG, but let's be safe.
3313                  */
3314                 max_xid = XLogRecGetXid(record);
3315                 for (i = 0; i < xlrec->nmembers; i++)
3316                 {
3317                         if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3318                                 max_xid = xlrec->members[i].xid;
3319                 }
3320
3321                 /*
3322                  * We don't expect anyone else to modify nextXid, hence startup
3323                  * process doesn't need to hold a lock while checking this. We still
3324                  * acquire the lock to modify it, though.
3325                  */
3326                 if (TransactionIdFollowsOrEquals(max_xid,
3327                                                                                  ShmemVariableCache->nextXid))
3328                 {
3329                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
3330                         ShmemVariableCache->nextXid = max_xid;
3331                         TransactionIdAdvance(ShmemVariableCache->nextXid);
3332                         LWLockRelease(XidGenLock);
3333                 }
3334         }
3335         else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3336         {
3337                 xl_multixact_truncate xlrec;
3338                 int                     pageno;
3339
3340                 memcpy(&xlrec, XLogRecGetData(record),
3341                            SizeOfMultiXactTruncate);
3342
3343                 elog(DEBUG1, "replaying multixact truncation: "
3344                          "offsets [%u, %u), offsets segments [%x, %x), "
3345                          "members [%u, %u), members segments [%x, %x)",
3346                          xlrec.startTruncOff, xlrec.endTruncOff,
3347                          MultiXactIdToOffsetSegment(xlrec.startTruncOff),
3348                          MultiXactIdToOffsetSegment(xlrec.endTruncOff),
3349                          xlrec.startTruncMemb, xlrec.endTruncMemb,
3350                          MXOffsetToMemberSegment(xlrec.startTruncMemb),
3351                          MXOffsetToMemberSegment(xlrec.endTruncMemb));
3352
3353                 /* should not be required, but more than cheap enough */
3354                 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3355
3356                 /*
3357                  * Advance the horizon values, so they're current at the end of
3358                  * recovery.
3359                  */
3360                 SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB);
3361
3362                 PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
3363
3364                 /*
3365                  * During XLOG replay, latest_page_number isn't necessarily set up
3366                  * yet; insert a suitable value to bypass the sanity test in
3367                  * SimpleLruTruncate.
3368                  */
3369                 pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3370                 MultiXactOffsetCtl->shared->latest_page_number = pageno;
3371                 PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);
3372
3373                 LWLockRelease(MultiXactTruncationLock);
3374
3375                 /* only looked at in the startup process, no lock necessary */
3376                 MultiXactState->sawTruncationInCkptCycle = true;
3377         }
3378         else
3379                 elog(PANIC, "multixact_redo: unknown op code %u", info);
3380 }
3381
3382 Datum
3383 pg_get_multixact_members(PG_FUNCTION_ARGS)
3384 {
3385         typedef struct
3386         {
3387                 MultiXactMember *members;
3388                 int                     nmembers;
3389                 int                     iter;
3390         } mxact;
3391         MultiXactId mxid = PG_GETARG_UINT32(0);
3392         mxact      *multi;
3393         FuncCallContext *funccxt;
3394
3395         if (mxid < FirstMultiXactId)
3396                 ereport(ERROR,
3397                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3398                                  errmsg("invalid MultiXactId: %u", mxid)));
3399
3400         if (SRF_IS_FIRSTCALL())
3401         {
3402                 MemoryContext oldcxt;
3403                 TupleDesc       tupdesc;
3404
3405                 funccxt = SRF_FIRSTCALL_INIT();
3406                 oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
3407
3408                 multi = palloc(sizeof(mxact));
3409                 /* no need to allow for old values here */
3410                 multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
3411                                                                                                 false);
3412                 multi->iter = 0;
3413
3414                 tupdesc = CreateTemplateTupleDesc(2, false);
3415                 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid",
3416                                                    XIDOID, -1, 0);
3417                 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode",
3418                                                    TEXTOID, -1, 0);
3419
3420                 funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
3421                 funccxt->user_fctx = multi;
3422
3423                 MemoryContextSwitchTo(oldcxt);
3424         }
3425
3426         funccxt = SRF_PERCALL_SETUP();
3427         multi = (mxact *) funccxt->user_fctx;
3428
3429         while (multi->iter < multi->nmembers)
3430         {
3431                 HeapTuple       tuple;
3432                 char       *values[2];
3433
3434                 values[0] = psprintf("%u", multi->members[multi->iter].xid);
3435                 values[1] = mxstatus_to_string(multi->members[multi->iter].status);
3436
3437                 tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
3438
3439                 multi->iter++;
3440                 pfree(values[0]);
3441                 SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
3442         }
3443
3444         if (multi->nmembers > 0)
3445                 pfree(multi->members);
3446         pfree(multi);
3447
3448         SRF_RETURN_DONE(funccxt);
3449 }