From f5b2f60bd1084e218358adba04604147e5429233 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 8 Jun 2005 15:50:28 +0000 Subject: [PATCH] Change WAL-logging scheme for multixacts to be more like regular transaction IDs, rather than like subtrans; in particular, the information now survives a database restart. Per previous discussion, this is essential for PITR log shipping and for 2PC. --- doc/src/sgml/ref/pg_resetxlog.sgml | 28 +- src/backend/access/heap/heapam.c | 24 +- src/backend/access/transam/multixact.c | 621 ++++++++++++++++-------- src/backend/access/transam/rmgr.c | 5 +- src/backend/access/transam/xlog.c | 61 +-- src/bin/pg_controldata/pg_controldata.c | 3 +- src/bin/pg_resetxlog/pg_resetxlog.c | 28 +- src/include/access/htup.h | 4 +- src/include/access/multixact.h | 37 +- src/include/access/xlog.h | 3 +- src/include/c.h | 4 +- src/include/catalog/pg_control.h | 6 +- 12 files changed, 555 insertions(+), 269 deletions(-) diff --git a/doc/src/sgml/ref/pg_resetxlog.sgml b/doc/src/sgml/ref/pg_resetxlog.sgml index f5915adacd..f4caa8b80e 100644 --- a/doc/src/sgml/ref/pg_resetxlog.sgml +++ b/doc/src/sgml/ref/pg_resetxlog.sgml @@ -1,5 +1,5 @@ @@ -23,6 +23,7 @@ PostgreSQL documentation -o oid -x xid -m mxid + -O mxoff -l timelineid,fileid,seg datadir @@ -32,8 +33,8 @@ PostgreSQL documentation Description pg_resetxlog clears the write-ahead log (WAL) and - optionally resets some other control information (stored in the - pg_control file). This function is sometimes needed + optionally resets some other control information stored in the + pg_control file. This function is sometimes needed if these files have become corrupted. It should be used only as a last resort, when the server will not start due to such corruption. @@ -60,8 +61,9 @@ PostgreSQL documentation by specifying the -f (force) switch. In this case plausible values will be substituted for the missing data. Most of the fields can be expected to match, but manual assistance may be needed for the next OID, - next transaction ID, WAL starting address, and database locale fields. - The first three of these can be set using the switches discussed below. + next transaction ID, next multi-transaction ID and offset, + WAL starting address, and database locale fields. + The first five of these can be set using the switches discussed below. pg_resetxlog's own environment is the source for its guess at the locale fields; take care that LANG and so forth match the environment that initdb was run in. @@ -74,9 +76,10 @@ PostgreSQL documentation - The -o, -x, -m, and -l + The -o, -x, -m, -O, + and -l switches allow the next OID, next transaction ID, next multi-transaction - ID, and WAL starting address values to + ID, next multi-transaction offset, and WAL starting address values to be set manually. These are only needed when pg_resetxlog is unable to determine appropriate values by reading pg_control. Safe values may be determined as @@ -108,6 +111,17 @@ PostgreSQL documentation + + + A safe value for the next multi-transaction offset (-O) + may be determined by looking for the numerically largest + file name in the directory pg_multixact/members under the + data directory, adding one, and then multiplying by 65536. As above, + the file names are in hexadecimal, so the easiest way to do this is to + specify the switch value in hexadecimal and add four zeroes. + + + The WAL starting address (-l) should be diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 433a4b4538..74f76c1d16 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.193 2005/06/06 20:22:56 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.194 2005/06/08 15:50:21 tgl Exp $ * * * INTERFACE ROUTINES @@ -2219,6 +2219,8 @@ l3: * Else the same IDs might be re-used after a crash, which would be * disastrous if this page made it to disk before the crash. Essentially * we have to enforce the WAL log-before-data rule even in this case. + * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG + * entries for everything anyway.) */ if (!relation->rd_istemp) { @@ -2228,6 +2230,8 @@ l3: xlrec.target.node = relation->rd_node; xlrec.target.tid = tuple->t_self; + xlrec.locking_xid = xid; + xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0); xlrec.shared_lock = (mode == LockTupleShared); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapLock; @@ -2900,17 +2904,18 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) htup = (HeapTupleHeader) PageGetItem(page, lp); - /* - * Presently, we don't bother to restore the locked state, but - * just set the XMAX_INVALID bit. - */ htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); - htup->t_infomask |= HEAP_XMAX_INVALID; - HeapTupleHeaderSetXmax(htup, record->xl_xid); + if (xlrec->xid_is_mxact) + htup->t_infomask |= HEAP_XMAX_IS_MULTI; + if (xlrec->shared_lock) + htup->t_infomask |= HEAP_XMAX_SHARED_LOCK; + else + htup->t_infomask |= HEAP_XMAX_EXCL_LOCK; + HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; @@ -3010,6 +3015,11 @@ heap_desc(char *buf, uint8 xl_info, char *rec) strcat(buf, "shared_lock: "); else strcat(buf, "exclusive_lock: "); + if (xlrec->xid_is_mxact) + strcat(buf, "mxid "); + else + strcat(buf, "xid "); + sprintf(buf + strlen(buf), "%u ", xlrec->locking_xid); out_target(buf, &(xlrec->target)); } else diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 85acfe2cc0..41773625af 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -10,7 +10,7 @@ * tuple to be unlocked can sleep on the potentially-several TransactionIds * that compose the MultiXactId. * - * We use two SLRU areas, one for storing the offsets on which the data + * We use two SLRU areas, one for storing the offsets at which the data * starts for each MultiXactId in the other one. This trick allows us to * store variable length arrays of TransactionIds. (We could alternatively * use one area containing counts and TransactionIds, with valid MultiXactId @@ -18,20 +18,31 @@ * since it would get completely confused if someone inquired about a bogus * MultiXactId that pointed to an intermediate slot containing an XID.) * - * This code is based on subtrans.c; see it for additional discussion. - * Like the subtransaction manager, we only need to remember multixact - * information for currently-open transactions. Thus, there is - * no need to preserve data over a crash and restart. + * XLOG interactions: this module generates an XLOG record whenever a new + * OFFSETs or MEMBERs page is initialized to zeroes, as well as an XLOG record + * whenever a new MultiXactId is defined. This allows us to completely + * rebuild the data entered since the last checkpoint during XLOG replay. + * Because this is possible, we need not follow the normal rule of + * "write WAL before data"; the only correctness guarantee needed is that + * we flush and sync all dirty OFFSETs and MEMBERs pages to disk before a + * checkpoint is considered complete. If a page does make it to disk ahead + * of corresponding WAL records, it will be forcibly zeroed before use anyway. + * Therefore, we don't need to mark our pages with LSN information; we have + * enough synchronization already. + * + * Like clog.c, and unlike subtrans.c, we have to preserve state across + * crashes and ensure that MXID and offset numbering increases monotonically + * across a crash. We do this in the same way as it's done for transaction + * IDs: the WAL record is guaranteed to contain evidence of every MXID we + * could need to worry about, and we just make sure that at the end of + * replay, the next-MXID and next-offset counters are at least as large as + * anything we saw during replay. * - * The only XLOG interaction we need to take care of is that generated - * MultiXactId values must continue to increase across a system crash. - * Thus we log groups of MultiXactIds acquisition in the same fashion we do - * for Oids (see XLogPutNextMultiXactId). * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.4 2005/05/19 21:35:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.5 2005/06/08 15:50:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -51,8 +62,8 @@ * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is * used everywhere else in Postgres. * - * Note: because both uint32 and TransactionIds are 32 bits and wrap around at - * 0xFFFFFFFF, MultiXact page numbering also wraps around at + * Note: because both MultiXactOffsets and TransactionIds are 32 bits and + * wrap around at 0xFFFFFFFF, MultiXact page numbering also wraps around at * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE, and segment numbering at * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need take no * explicit notice of that fact in this module, except when comparing segment @@ -61,21 +72,19 @@ */ /* We need four bytes per offset and also four bytes per member */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(uint32)) +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) #define MULTIXACT_MEMBERS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) #define MultiXactIdToOffsetPage(xid) \ - ((xid) / (uint32) MULTIXACT_OFFSETS_PER_PAGE) + ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) #define MultiXactIdToOffsetEntry(xid) \ - ((xid) % (uint32) MULTIXACT_OFFSETS_PER_PAGE) + ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) #define MXOffsetToMemberPage(xid) \ ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) #define MXOffsetToMemberEntry(xid) \ ((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) -/* Arbitrary number of MultiXactIds to allocate at each XLog call */ -#define MXACT_PREFETCH 8192 /* * Links to shared-memory data structures for MultiXact control @@ -98,11 +107,8 @@ typedef struct MultiXactStateData /* next-to-be-assigned MultiXactId */ MultiXactId nextMXact; - /* MultiXactIds we have left before logging more */ - uint32 mXactCount; - /* next-to-be-assigned offset */ - uint32 nextOffset; + MultiXactOffset nextOffset; /* the Offset SLRU area was last truncated at this MultiXactId */ MultiXactId lastTruncationPoint; @@ -161,7 +167,8 @@ static MultiXactId *OldestVisibleMXactId; * for this being that most entries will contain our own TransactionId and * so they will be uninteresting by the time our next transaction starts. * (XXX not clear that this is correct --- other members of the MultiXact - * could hang around longer than we did.) + * could hang around longer than we did. However, it's not clear what a + * better policy for flushing old cache entries would be.) * * We allocate the cache entries in a memory context that is deleted at * transaction end, so we don't need to do retail freeing of entries. @@ -194,7 +201,9 @@ static MemoryContext MXactContext = NULL; static void MultiXactIdSetOldestVisible(void); static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids); static int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids); -static MultiXactId GetNewMultiXactId(int nxids, uint32 *offset); +static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nxids, TransactionId *xids); +static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset); /* MultiXact cache management */ static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids); @@ -206,15 +215,17 @@ static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids); #endif /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int pageno); -static int ZeroMultiXactMemberPage(int pageno); +static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); +static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int page1, int page2); static bool MultiXactMemberPagePrecedes(int page1, int page2); static bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); -static bool MultiXactOffsetPrecedes(uint32 offset1, uint32 offset2); +static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, + MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); -static void ExtendMultiXactMember(uint32 offset); +static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); static void TruncateMultiXact(void); +static void WriteMZeroPageXlogRec(int pageno, uint8 info); /* @@ -551,8 +562,8 @@ MultiXactIdWait(MultiXactId multi) * CreateMultiXactId * Make a new MultiXactId * - * Make SLRU and cache entries for a new MultiXactId, recording the given - * TransactionIds as members. Returns the newly created MultiXactId. + * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the + * given TransactionIds as members. Returns the newly created MultiXactId. * * NB: the passed xids[] array will be sorted in-place. */ @@ -560,13 +571,9 @@ static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids) { MultiXactId multi; - int pageno; - int prev_pageno; - int entryno; - int slotno; - uint32 *offptr; - uint32 offset; - int i; + MultiXactOffset offset; + XLogRecData rdata[2]; + xl_multixact_create xlrec; debug_elog3(DEBUG2, "Create: %s", mxid_to_string(InvalidMultiXactId, nxids, xids)); @@ -588,11 +595,70 @@ CreateMultiXactId(int nxids, TransactionId *xids) return multi; } + /* + * OK, assign the MXID and offsets range to use + */ multi = GetNewMultiXactId(nxids, &offset); - LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + debug_elog4(DEBUG2, "Create: assigned id %u offset %u", multi, offset); + + /* + * Make an XLOG entry describing the new MXID. + * + * Note: we need not flush this XLOG entry to disk before proceeding. + * The only way for the MXID to be referenced from any data page is + * for heap_lock_tuple() to have put it there, and heap_lock_tuple() + * generates an XLOG record that must follow ours. The normal LSN + * interlock between the data page and that XLOG record will ensure + * that our XLOG record reaches disk first. If the SLRU members/offsets + * data reaches disk sooner than the XLOG record, we do not care because + * we'll overwrite it with zeroes unless the XLOG record is there too; + * see notes at top of this file. + */ + xlrec.mid = multi; + xlrec.moff = offset; + xlrec.nxids = nxids; + + rdata[0].data = (char *) (&xlrec); + rdata[0].len = MinSizeOfMultiXactCreate; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + rdata[1].data = (char *) xids; + rdata[1].len = nxids * sizeof(TransactionId); + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata); + + /* Now enter the information into the OFFSETs and MEMBERs logs */ + RecordNewMultiXact(multi, offset, nxids, xids); + + /* Store the new MultiXactId in the local cache, too */ + mXactCachePut(multi, nxids, xids); + + debug_elog2(DEBUG2, "Create: all done"); + + return multi; +} + +/* + * RecordNewMultiXact + * Write info about a new multixact into the offsets and members files + * + * This is broken out of CreateMultiXactId so that xlog replay can use it. + */ +static void +RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nxids, TransactionId *xids) +{ + int pageno; + int prev_pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + int i; - ExtendMultiXactOffset(multi); + LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -605,8 +671,9 @@ CreateMultiXactId(int nxids, TransactionId *xids) * we'll take the trouble to generalize the slru.c error reporting code. */ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi); - offptr = (uint32 *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; + *offptr = offset; MultiXactOffsetCtl->shared->page_status[slotno] = SLRU_PAGE_DIRTY; @@ -614,8 +681,6 @@ CreateMultiXactId(int nxids, TransactionId *xids) /* Exchange our lock */ LWLockRelease(MultiXactOffsetControlLock); - debug_elog3(DEBUG2, "Create: got offset %u", offset); - LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); prev_pageno = -1; @@ -624,8 +689,6 @@ CreateMultiXactId(int nxids, TransactionId *xids) { TransactionId *memberptr; - ExtendMultiXactMember(offset); - pageno = MXOffsetToMemberPage(offset); entryno = MXOffsetToMemberEntry(offset); @@ -640,29 +703,27 @@ CreateMultiXactId(int nxids, TransactionId *xids) memberptr += entryno; *memberptr = xids[i]; + MultiXactMemberCtl->shared->page_status[slotno] = SLRU_PAGE_DIRTY; } LWLockRelease(MultiXactMemberControlLock); - - /* Store the new MultiXactId in the local cache, too */ - mXactCachePut(multi, nxids, xids); - - debug_elog2(DEBUG2, "Create: all done"); - - return multi; } /* * GetNewMultiXactId * Get the next MultiXactId. * - * Get the next MultiXactId, XLogging if needed. Also, reserve the needed - * amount of space in the "members" area. The starting offset of the - * reserved space is returned in *offset. + * Also, reserve the needed amount of space in the "members" area. The + * starting offset of the reserved space is returned in *offset. + * + * This may generate XLOG records for expansion of the offsets and/or members + * files. Unfortunately, we have to do that while holding MultiXactGenLock + * to avoid race conditions --- the XLOG record for zeroing a page must appear + * before any backend can possibly try to store data in that page! */ static MultiXactId -GetNewMultiXactId(int nxids, uint32 *offset) +GetNewMultiXactId(int nxids, MultiXactOffset *offset) { MultiXactId result; @@ -675,33 +736,33 @@ GetNewMultiXactId(int nxids, uint32 *offset) /* Handle wraparound of the nextMXact counter */ if (MultiXactState->nextMXact < FirstMultiXactId) - { MultiXactState->nextMXact = FirstMultiXactId; - MultiXactState->mXactCount = 0; - } - - /* If we run out of logged for use multixacts then we must log more */ - if (MultiXactState->mXactCount == 0) - { - XLogPutNextMultiXactId(MultiXactState->nextMXact + MXACT_PREFETCH); - MultiXactState->mXactCount = MXACT_PREFETCH; - } + /* + * Assign the MXID, and make sure there is room for it in the file. + */ result = MultiXactState->nextMXact; + ExtendMultiXactOffset(result); + /* + * Advance counter. As in GetNewTransactionId(), this must not happen + * until after ExtendMultiXactOffset has succeeded! + * * We don't care about MultiXactId wraparound here; it will be handled by * the next iteration. But note that nextMXact may be InvalidMultiXactId * after this routine exits, so anyone else looking at the variable must * be prepared to deal with that. */ (MultiXactState->nextMXact)++; - (MultiXactState->mXactCount)--; /* - * Reserve the members space. + * Reserve the members space. Same considerations as above. */ *offset = MultiXactState->nextOffset; + + ExtendMultiXactMember(*offset, nxids); + MultiXactState->nextOffset += nxids; LWLockRelease(MultiXactGenLock); @@ -725,13 +786,13 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) int prev_pageno; int entryno; int slotno; - uint32 *offptr; - uint32 offset; + MultiXactOffset *offptr; + MultiXactOffset offset; int length; int i; MultiXactId nextMXact; MultiXactId tmpMXact; - uint32 nextOffset; + MultiXactOffset nextOffset; TransactionId *ptr; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -799,7 +860,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) entryno = MultiXactIdToOffsetEntry(multi); slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi); - offptr = (uint32 *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; offset = *offptr; @@ -829,7 +890,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) if (pageno != prev_pageno) slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, tmpMXact); - offptr = (uint32 *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; length = *offptr - offset; } @@ -1086,10 +1147,6 @@ MultiXactShmemInit(void) SimpleLruInit(MultiXactMemberCtl, "MultiXactMember Ctl", MultiXactMemberControlLock, "pg_multixact/members"); - /* Override default assumption that writes should be fsync'd */ - MultiXactOffsetCtl->do_fsync = false; - MultiXactMemberCtl->do_fsync = false; - /* Initialize our shared state struct */ MultiXactState = ShmemInitStruct("Shared MultiXact State", SHARED_MULTIXACT_STATE_SIZE, @@ -1116,10 +1173,6 @@ MultiXactShmemInit(void) * This func must be called ONCE on system install. It creates the initial * MultiXact segments. (The MultiXacts directories are assumed to have been * created by initdb, and MultiXactShmemInit must have been called already.) - * - * Note: it's not really necessary to create the initial segments now, - * since slru.c would create 'em on first write anyway. But we may as well - * do it to be sure the directories are set up correctly. */ void BootStrapMultiXact(void) @@ -1128,8 +1181,10 @@ BootStrapMultiXact(void) LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); - /* Offsets first page */ - slotno = ZeroMultiXactOffsetPage(0); + /* Create and zero the first page of the offsets log */ + slotno = ZeroMultiXactOffsetPage(0, false); + + /* Make sure it's written out */ SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL); Assert(MultiXactOffsetCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN); @@ -1137,8 +1192,10 @@ BootStrapMultiXact(void) LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); - /* Members first page */ - slotno = ZeroMultiXactMemberPage(0); + /* Create and zero the first page of the members log */ + slotno = ZeroMultiXactMemberPage(0, false); + + /* Make sure it's written out */ SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL); Assert(MultiXactMemberCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN); @@ -1147,6 +1204,7 @@ BootStrapMultiXact(void) /* * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. + * If writeXlog is TRUE, also emit an XLOG record saying we did this. * * The page is not actually written, just set up in shared memory. * The slot number of the new page is returned. @@ -1154,25 +1212,40 @@ BootStrapMultiXact(void) * Control lock must be held at entry, and will be held at exit. */ static int -ZeroMultiXactOffsetPage(int pageno) +ZeroMultiXactOffsetPage(int pageno, bool writeXlog) { - return SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + int slotno; + + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + + return slotno; } /* * Ditto, for MultiXactMember */ static int -ZeroMultiXactMemberPage(int pageno) +ZeroMultiXactMemberPage(int pageno, bool writeXlog) { - return SimpleLruZeroPage(MultiXactMemberCtl, pageno); + int slotno; + + slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); + + return slotno; } /* * This must be called ONCE during postmaster or standalone-backend startup. * - * StartupXLOG has already established nextMXact by calling - * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. + * StartupXLOG has already established nextMXact/nextOffset by calling + * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we + * may already have replayed WAL data into the SLRU files. * * We don't need any locks here, really; the SLRU locks are taken * only because slru.c expects to be called with locks held. @@ -1180,68 +1253,76 @@ ZeroMultiXactMemberPage(int pageno) void StartupMultiXact(void) { - int startPage; - int cutoffPage; - uint32 offset; + MultiXactId multi = MultiXactState->nextMXact; + MultiXactOffset offset = MultiXactState->nextOffset; + int pageno; + int entryno; + + /* Clean up offsets state */ + LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); /* - * We start nextOffset at zero after every reboot; there is no need to - * avoid offset values that were used in the previous system lifecycle. + * Initialize our idea of the latest page number. */ - MultiXactState->nextOffset = 0; + pageno = MultiXactIdToOffsetPage(multi); + MultiXactOffsetCtl->shared->latest_page_number = pageno; /* - * Because of the above, a shutdown and restart is likely to leave - * high-numbered MultiXactMember page files that would not get recycled - * for a long time (about as long as the system had been up in the - * previous cycle of life). To clean out such page files, we issue an - * artificial truncation call that will zap any page files in the first - * half of the offset cycle. Should there be any page files in the last - * half, they will get cleaned out by the first checkpoint. - * - * XXX it might be a good idea to disable this when debugging, since it - * will tend to destroy evidence after a crash. To not be *too* ruthless, - * we arbitrarily spare the first 64 pages. (Note this will get - * rounded off to a multiple of SLRU_PAGES_PER_SEGMENT ...) + * Zero out the remainder of the current offsets page. See notes + * in StartupCLOG() for motivation. */ - offset = ((~ (uint32) 0) >> 1) + 1; + entryno = MultiXactIdToOffsetEntry(multi); + if (entryno != 0) + { + int slotno; + MultiXactOffset *offptr; - cutoffPage = MXOffsetToMemberPage(offset) + 64; + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; - /* - * Defeat safety interlock in SimpleLruTruncate; this hack will be - * cleaned up by ZeroMultiXactMemberPage call below. - */ - MultiXactMemberCtl->shared->latest_page_number = cutoffPage; + MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); - SimpleLruTruncate(MultiXactMemberCtl, cutoffPage); + MultiXactOffsetCtl->shared->page_status[slotno] = SLRU_PAGE_DIRTY; + } + + LWLockRelease(MultiXactOffsetControlLock); + + /* And the same for members */ + LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); /* - * Initialize lastTruncationPoint to invalid, ensuring that the first - * checkpoint will try to do truncation. + * Initialize our idea of the latest page number. */ - MultiXactState->lastTruncationPoint = InvalidMultiXactId; + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; /* - * Since we don't expect MultiXact to be valid across crashes, we - * initialize the currently-active pages to zeroes during startup. - * Whenever we advance into a new page, both ExtendMultiXact routines - * will likewise zero the new page without regard to whatever was - * previously on disk. + * Zero out the remainder of the current members page. See notes + * in StartupCLOG() for motivation. */ - LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); - - startPage = MultiXactIdToOffsetPage(MultiXactState->nextMXact); - (void) ZeroMultiXactOffsetPage(startPage); + entryno = MXOffsetToMemberEntry(offset); + if (entryno != 0) + { + int slotno; + TransactionId *xidptr; - LWLockRelease(MultiXactOffsetControlLock); + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, offset); + xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno]; + xidptr += entryno; - LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); + MemSet(xidptr, 0, BLCKSZ - (entryno * sizeof(TransactionId))); - startPage = MXOffsetToMemberPage(MultiXactState->nextOffset); - (void) ZeroMultiXactMemberPage(startPage); + MultiXactMemberCtl->shared->page_status[slotno] = SLRU_PAGE_DIRTY; + } LWLockRelease(MultiXactMemberControlLock); + + /* + * Initialize lastTruncationPoint to invalid, ensuring that the first + * checkpoint will try to do truncation. + */ + MultiXactState->lastTruncationPoint = InvalidMultiXactId; } /* @@ -1250,36 +1331,28 @@ StartupMultiXact(void) void ShutdownMultiXact(void) { - /* - * Flush dirty MultiXact pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely as a debugging aid. - */ + /* Flush dirty MultiXact pages to disk */ SimpleLruFlush(MultiXactOffsetCtl, false); SimpleLruFlush(MultiXactMemberCtl, false); } /* - * Get the next MultiXactId to save in a checkpoint record + * Get the next MultiXactId and offset to save in a checkpoint record */ -MultiXactId -MultiXactGetCheckptMulti(bool is_shutdown) +void +MultiXactGetCheckptMulti(bool is_shutdown, + MultiXactId *nextMulti, + MultiXactOffset *nextMultiOffset) { - MultiXactId retval; - LWLockAcquire(MultiXactGenLock, LW_SHARED); - retval = MultiXactState->nextMXact; - if (!is_shutdown) - retval += MultiXactState->mXactCount; + *nextMulti = MultiXactState->nextMXact; + *nextMultiOffset = MultiXactState->nextOffset; LWLockRelease(MultiXactGenLock); - debug_elog3(DEBUG2, "MultiXact: MultiXact for checkpoint record is %u", - retval); - - return retval; + debug_elog4(DEBUG2, "MultiXact: checkpoint is nextMulti %u, nextOffset %u", + *nextMulti, *nextMultiOffset); } /* @@ -1288,62 +1361,68 @@ MultiXactGetCheckptMulti(bool is_shutdown) void CheckPointMultiXact(void) { - /* - * Flush dirty MultiXact pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely to improve the odds that writing of dirty pages is done - * by the checkpoint process and not by backends. - */ + /* Flush dirty MultiXact pages to disk */ SimpleLruFlush(MultiXactOffsetCtl, true); SimpleLruFlush(MultiXactMemberCtl, true); /* - * Truncate the SLRU files + * Truncate the SLRU files. This could be done at any time, but + * checkpoint seems a reasonable place for it. */ TruncateMultiXact(); } /* - * Set the next-to-be-assigned MultiXactId + * Set the next-to-be-assigned MultiXactId and offset * - * This is used when we can determine the correct next Id exactly - * from an XLog record. We need no locking since it is only called + * This is used when we can determine the correct next ID/offset exactly + * from a checkpoint record. We need no locking since it is only called * during bootstrap and XLog replay. */ void -MultiXactSetNextMXact(MultiXactId nextMulti) +MultiXactSetNextMXact(MultiXactId nextMulti, + MultiXactOffset nextMultiOffset) { - debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", nextMulti); + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + nextMulti, nextMultiOffset); MultiXactState->nextMXact = nextMulti; - MultiXactState->mXactCount = 0; + MultiXactState->nextOffset = nextMultiOffset; } /* - * Ensure the next-to-be-assigned MultiXactId is at least minMulti + * Ensure the next-to-be-assigned MultiXactId is at least minMulti, + * and similarly nextOffset is at least minMultiOffset * - * This is used when we can determine a minimum safe value - * from an XLog record. We need no locking since it is only called - * during XLog replay. + * This is used when we can determine minimum safe values from an XLog + * record (either an on-line checkpoint or an mxact creation log entry). + * We need no locking since it is only called during XLog replay. */ void -MultiXactAdvanceNextMXact(MultiXactId minMulti) +MultiXactAdvanceNextMXact(MultiXactId minMulti, + MultiXactOffset minMultiOffset) { if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) { debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); MultiXactState->nextMXact = minMulti; - MultiXactState->mXactCount = 0; + } + if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + { + debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + minMultiOffset); + MultiXactState->nextOffset = minMultiOffset; } } /* * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. * - * The MultiXactOffsetControlLock should be held at entry, and will - * be held at exit. + * NB: this is called while holding MultiXactGenLock. We want it to be very + * fast most of the time; even when it's not so fast, no actual I/O need + * happen unless we're forced to write out a dirty log or xlog page to make + * room in shared memory. */ -void +static void ExtendMultiXactOffset(MultiXactId multi) { int pageno; @@ -1358,32 +1437,56 @@ ExtendMultiXactOffset(MultiXactId multi) pageno = MultiXactIdToOffsetPage(multi); - /* Zero the page */ - ZeroMultiXactOffsetPage(pageno); + LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactOffsetPage(pageno, true); + + LWLockRelease(MultiXactOffsetControlLock); } /* * Make sure that MultiXactMember has room for the members of a newly- * allocated MultiXactId. * - * The MultiXactMemberControlLock should be held at entry, and will be held - * at exit. + * Like the above routine, this is called while holding MultiXactGenLock; + * same comments apply. */ -void -ExtendMultiXactMember(uint32 offset) +static void +ExtendMultiXactMember(MultiXactOffset offset, int nmembers) { - int pageno; - /* - * No work except at first entry of a page. + * It's possible that the members span more than one page of the + * members file, so we loop to ensure we consider each page. The + * coding is not optimal if the members span several pages, but + * that seems unusual enough to not worry much about. */ - if (MXOffsetToMemberEntry(offset) != 0) - return; + while (nmembers > 0) + { + int entryno; - pageno = MXOffsetToMemberPage(offset); + /* + * Only zero when at first entry of a page. + */ + entryno = MXOffsetToMemberEntry(offset); + if (entryno == 0) + { + int pageno; - /* Zero the page */ - ZeroMultiXactMemberPage(pageno); + pageno = MXOffsetToMemberPage(offset); + + LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactMemberPage(pageno, true); + + LWLockRelease(MultiXactMemberControlLock); + } + + /* Advance to next page (OK if nmembers goes negative) */ + offset += (MULTIXACT_MEMBERS_PER_PAGE - entryno); + nmembers -= (MULTIXACT_MEMBERS_PER_PAGE - entryno); + } } /* @@ -1392,14 +1495,16 @@ ExtendMultiXactMember(uint32 offset) * * This is called only during checkpoints. We assume no more than one * backend does this at a time. + * + * XXX do we have any issues with needing to checkpoint here? */ static void TruncateMultiXact(void) { MultiXactId nextMXact; - uint32 nextOffset; + MultiXactOffset nextOffset; MultiXactId oldestMXact; - uint32 oldestOffset; + MultiXactOffset oldestOffset; int cutoffPage; int i; @@ -1460,7 +1565,7 @@ TruncateMultiXact(void) int pageno; int slotno; int entryno; - uint32 *offptr; + MultiXactOffset *offptr; LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); @@ -1468,7 +1573,7 @@ TruncateMultiXact(void) entryno = MultiXactIdToOffsetEntry(oldestMXact); slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, oldestMXact); - offptr = (uint32 *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; oldestOffset = *offptr; @@ -1529,11 +1634,11 @@ MultiXactOffsetPagePrecedes(int page1, int page2) static bool MultiXactMemberPagePrecedes(int page1, int page2) { - uint32 offset1; - uint32 offset2; + MultiXactOffset offset1; + MultiXactOffset offset2; - offset1 = ((uint32) page1) * MULTIXACT_MEMBERS_PER_PAGE; - offset2 = ((uint32) page2) * MULTIXACT_MEMBERS_PER_PAGE; + offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; + offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; return MultiXactOffsetPrecedes(offset1, offset2); } @@ -1556,9 +1661,135 @@ MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) * Decide which of two offsets is earlier. */ static bool -MultiXactOffsetPrecedes(uint32 offset1, uint32 offset2) +MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) { int32 diff = (int32) (offset1 - offset2); return (diff < 0); } + + +/* + * Write an xlog record reflecting the zeroing of either a MEMBERs or + * OFFSETs page (info shows which) + * + * Note: xlog record is marked as outside transaction control, since we + * want it to be redone whether the invoking transaction commits or not. + */ +static void +WriteMZeroPageXlogRec(int pageno, uint8 info) +{ + XLogRecData rdata; + + rdata.data = (char *) (&pageno); + rdata.len = sizeof(int); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + (void) XLogInsert(RM_MULTIXACT_ID, info | XLOG_NO_TRAN, &rdata); +} + +/* + * MULTIXACT resource manager's routines + */ +void +multixact_redo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL); + Assert(MultiXactOffsetCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN); + + LWLockRelease(MultiXactOffsetControlLock); + } + else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactMemberPage(pageno, false); + SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL); + Assert(MultiXactMemberCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN); + + LWLockRelease(MultiXactMemberControlLock); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = (xl_multixact_create *) XLogRecGetData(record); + TransactionId *xids = xlrec->xids; + TransactionId max_xid; + int i; + + /* Store the data back into the SLRU files */ + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids); + + /* Make sure nextMXact/nextOffset are beyond what this record has */ + MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids); + + /* + * Make sure nextXid is beyond any XID mentioned in the record. + * This should be unnecessary, since any XID found here ought to + * have other evidence in the XLOG, but let's be safe. + */ + max_xid = record->xl_xid; + for (i = 0; i < xlrec->nxids; i++) + { + if (TransactionIdPrecedes(max_xid, xids[i])) + max_xid = xids[i]; + } + if (TransactionIdFollowsOrEquals(max_xid, + ShmemVariableCache->nextXid)) + { + ShmemVariableCache->nextXid = max_xid; + TransactionIdAdvance(ShmemVariableCache->nextXid); + } + } + else + elog(PANIC, "multixact_redo: unknown op code %u", info); +} + +void +multixact_desc(char *buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + sprintf(buf + strlen(buf), "zero offsets page: %d", pageno); + } + else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + sprintf(buf + strlen(buf), "zero members page: %d", pageno); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = (xl_multixact_create *) rec; + int i; + + sprintf(buf + strlen(buf), "create multixact %u offset %u:", + xlrec->mid, xlrec->moff); + for (i = 0; i < xlrec->nxids; i++) + sprintf(buf + strlen(buf), " %u", xlrec->xids[i]); + } + else + strcat(buf, "UNKNOWN"); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 39d647f8a6..5fe442fd80 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.18 2005/06/06 17:01:22 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.19 2005/06/08 15:50:26 tgl Exp $ */ #include "postgres.h" @@ -11,6 +11,7 @@ #include "access/gist_private.h" #include "access/hash.h" #include "access/heapam.h" +#include "access/multixact.h" #include "access/nbtree.h" #include "access/rtree.h" #include "access/xact.h" @@ -28,7 +29,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"CLOG", clog_redo, clog_desc, NULL, NULL}, {"Database", dbase_redo, dbase_desc, NULL, NULL}, {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL}, - {"Reserved 6", NULL, NULL, NULL, NULL}, + {"MultiXact", multixact_redo, multixact_desc, NULL, NULL}, {"Reserved 7", NULL, NULL, NULL, NULL}, {"Reserved 8", NULL, NULL, NULL, NULL}, {"Reserved 9", NULL, NULL, NULL, NULL}, diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index aa37244162..6c01c20eaa 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.197 2005/06/06 20:22:57 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.198 2005/06/08 15:50:26 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -3688,12 +3688,13 @@ BootStrapXLOG(void) checkPoint.nextXid = FirstNormalTransactionId; checkPoint.nextOid = FirstBootstrapObjectId; checkPoint.nextMulti = FirstMultiXactId; + checkPoint.nextMultiOffset = 0; checkPoint.time = time(NULL); ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; - MultiXactSetNextMXact(checkPoint.nextMulti); + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); /* Set up the XLOG page header */ page->xlp_magic = XLOG_PAGE_MAGIC; @@ -4344,8 +4345,11 @@ StartupXLOG(void) checkPoint.undo.xlogid, checkPoint.undo.xrecoff, wasShutdown ? "TRUE" : "FALSE"))); ereport(LOG, - (errmsg("next transaction ID: %u; next OID: %u; next MultiXactId: %u", - checkPoint.nextXid, checkPoint.nextOid, checkPoint.nextMulti))); + (errmsg("next transaction ID: %u; next OID: %u", + checkPoint.nextXid, checkPoint.nextOid))); + ereport(LOG, + (errmsg("next MultiXactId: %u; next MultiXactOffset: %u", + checkPoint.nextMulti, checkPoint.nextMultiOffset))); if (!TransactionIdIsNormal(checkPoint.nextXid)) ereport(PANIC, (errmsg("invalid next transaction ID"))); @@ -4353,7 +4357,7 @@ StartupXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; - MultiXactSetNextMXact(checkPoint.nextMulti); + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); /* * We must replay WAL entries using the same TimeLineID they were @@ -5080,7 +5084,9 @@ CreateCheckPoint(bool shutdown, bool force) checkPoint.nextOid += ShmemVariableCache->oidCount; LWLockRelease(OidGenLock); - checkPoint.nextMulti = MultiXactGetCheckptMulti(shutdown); + MultiXactGetCheckptMulti(shutdown, + &checkPoint.nextMulti, + &checkPoint.nextMultiOffset); /* * Having constructed the checkpoint record, ensure all shmem disk @@ -5228,25 +5234,6 @@ XLogPutNextOid(Oid nextOid) */ } -/* - * Write a NEXT_MULTIXACT log record - */ -void -XLogPutNextMultiXactId(MultiXactId nextMulti) -{ - XLogRecData rdata; - - rdata.data = (char *) (&nextMulti); - rdata.len = sizeof(MultiXactId); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTMULTI, &rdata); - /* - * We do not flush here either; this assumes that heap_lock_tuple() will - * always generate a WAL record. See notes therein. - */ -} - /* * XLOG resource manager's routines */ @@ -5266,14 +5253,6 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ShmemVariableCache->oidCount = 0; } } - else if (info == XLOG_NEXTMULTI) - { - MultiXactId nextMulti; - - memcpy(&nextMulti, XLogRecGetData(record), sizeof(MultiXactId)); - - MultiXactAdvanceNextMXact(nextMulti); - } else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; @@ -5283,7 +5262,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; - MultiXactSetNextMXact(checkPoint.nextMulti); + MultiXactSetNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); /* * TLI may change in a shutdown checkpoint, but it shouldn't @@ -5315,7 +5295,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; } - MultiXactAdvanceNextMXact(checkPoint.nextMulti); + MultiXactAdvanceNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); /* TLI should not change in an on-line checkpoint */ if (checkPoint.ThisTimeLineID != ThisTimeLineID) ereport(PANIC, @@ -5335,12 +5316,13 @@ xlog_desc(char *buf, uint8 xl_info, char *rec) CheckPoint *checkpoint = (CheckPoint *) rec; sprintf(buf + strlen(buf), "checkpoint: redo %X/%X; undo %X/%X; " - "tli %u; xid %u; oid %u; multi %u; %s", + "tli %u; xid %u; oid %u; multi %u; offset %u; %s", checkpoint->redo.xlogid, checkpoint->redo.xrecoff, checkpoint->undo.xlogid, checkpoint->undo.xrecoff, checkpoint->ThisTimeLineID, checkpoint->nextXid, checkpoint->nextOid, checkpoint->nextMulti, + checkpoint->nextMultiOffset, (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online"); } else if (info == XLOG_NEXTOID) @@ -5350,13 +5332,6 @@ xlog_desc(char *buf, uint8 xl_info, char *rec) memcpy(&nextOid, rec, sizeof(Oid)); sprintf(buf + strlen(buf), "nextOid: %u", nextOid); } - else if (info == XLOG_NEXTMULTI) - { - MultiXactId multi; - - memcpy(&multi, rec, sizeof(MultiXactId)); - sprintf(buf + strlen(buf), "nextMultiXact: %u", multi); - } else strcat(buf, "UNKNOWN"); } diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 77f61af06f..b1aed8f421 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -6,7 +6,7 @@ * copyright (c) Oliver Elphick , 2001; * licence: BSD * - * $PostgreSQL: pgsql/src/bin/pg_controldata/pg_controldata.c,v 1.24 2005/06/02 05:55:29 tgl Exp $ + * $PostgreSQL: pgsql/src/bin/pg_controldata/pg_controldata.c,v 1.25 2005/06/08 15:50:27 tgl Exp $ */ #include "postgres.h" @@ -166,6 +166,7 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's NextXID: %u\n"), ControlFile.checkPointCopy.nextXid); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); + printf(_("Latest checkpoint's NextMultiOffset: %u\n"), ControlFile.checkPointCopy.nextMultiOffset); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); printf(_("Database block size: %u\n"), ControlFile.blcksz); printf(_("Blocks per segment of large relation: %u\n"), ControlFile.relseg_size); diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index 6eceb0a354..15c291b1ee 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -23,7 +23,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/bin/pg_resetxlog/pg_resetxlog.c,v 1.33 2005/06/02 05:55:29 tgl Exp $ + * $PostgreSQL: pgsql/src/bin/pg_resetxlog/pg_resetxlog.c,v 1.34 2005/06/08 15:50:27 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -77,6 +77,7 @@ main(int argc, char *argv[]) TransactionId set_xid = 0; Oid set_oid = 0; MultiXactId set_mxid = 0; + MultiXactOffset set_mxoff = -1; uint32 minXlogTli = 0, minXlogId = 0, minXlogSeg = 0; @@ -106,7 +107,7 @@ main(int argc, char *argv[]) } - while ((c = getopt(argc, argv, "fl:m:no:x:")) != -1) + while ((c = getopt(argc, argv, "fl:m:no:O:x:")) != -1) { switch (c) { @@ -163,6 +164,21 @@ main(int argc, char *argv[]) } break; + case 'O': + set_mxoff = strtoul(optarg, &endptr, 0); + if (endptr == optarg || *endptr != '\0') + { + fprintf(stderr, _("%s: invalid argument for option -O\n"), progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + if (set_mxoff == -1) + { + fprintf(stderr, _("%s: multi transaction offset (-O) must not be -1\n"), progname); + exit(1); + } + break; + case 'l': minXlogTli = strtoul(optarg, &endptr, 0); if (endptr == optarg || *endptr != ',') @@ -265,6 +281,9 @@ main(int argc, char *argv[]) if (set_mxid != 0) ControlFile.checkPointCopy.nextMulti = set_mxid; + if (set_mxoff != -1) + ControlFile.checkPointCopy.nextMultiOffset = set_mxoff; + if (minXlogTli > ControlFile.checkPointCopy.ThisTimeLineID) ControlFile.checkPointCopy.ThisTimeLineID = minXlogTli; @@ -426,6 +445,7 @@ GuessControlValues(void) ControlFile.checkPointCopy.nextXid = (TransactionId) 514; /* XXX */ ControlFile.checkPointCopy.nextOid = FirstBootstrapObjectId; ControlFile.checkPointCopy.nextMulti = FirstMultiXactId; + ControlFile.checkPointCopy.nextMultiOffset = 0; ControlFile.checkPointCopy.time = time(NULL); ControlFile.state = DB_SHUTDOWNED; @@ -463,7 +483,7 @@ GuessControlValues(void) /* * XXX eventually, should try to grovel through old XLOG to develop - * more accurate values for TimeLineID, nextXID, and nextOID. + * more accurate values for TimeLineID, nextXID, etc. */ } @@ -500,6 +520,7 @@ PrintControlValues(bool guessed) printf(_("Latest checkpoint's NextXID: %u\n"), ControlFile.checkPointCopy.nextXid); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); + printf(_("Latest checkpoint's NextMultiOffset: %u\n"), ControlFile.checkPointCopy.nextMultiOffset); printf(_("Database block size: %u\n"), ControlFile.blcksz); printf(_("Blocks per segment of large relation: %u\n"), ControlFile.relseg_size); printf(_("Maximum length of identifiers: %u\n"), ControlFile.nameDataLen); @@ -777,6 +798,7 @@ usage(void) printf(_(" -o OID set next OID\n")); printf(_(" -x XID set next transaction ID\n")); printf(_(" -m multiXID set next multi transaction ID\n")); + printf(_(" -O multiOffset set next multi transaction offset\n")); printf(_(" --help show this help, then exit\n")); printf(_(" --version output version information, then exit\n")); printf(_("\nReport bugs to .\n")); diff --git a/src/include/access/htup.h b/src/include/access/htup.h index adeb05fd56..e394afd313 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.74 2005/04/28 21:47:17 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.75 2005/06/08 15:50:27 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -505,6 +505,8 @@ typedef struct xl_heap_newpage typedef struct xl_heap_lock { xl_heaptid target; /* locked tuple id */ + TransactionId locking_xid; /* might be a MultiXactId not xid */ + bool xid_is_mxact; /* is it? */ bool shared_lock; /* shared or exclusive row lock? */ } xl_heap_lock; diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 65d19704c4..2199b05f2c 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -6,16 +6,38 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/multixact.h,v 1.2 2005/05/03 19:42:41 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/multixact.h,v 1.3 2005/06/08 15:50:28 tgl Exp $ */ #ifndef MULTIXACT_H #define MULTIXACT_H +#include "access/xlog.h" + #define InvalidMultiXactId ((MultiXactId) 0) #define FirstMultiXactId ((MultiXactId) 1) #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) +/* ---------------- + * multixact-related XLOG entries + * ---------------- + */ + +#define XLOG_MULTIXACT_ZERO_OFF_PAGE 0x00 +#define XLOG_MULTIXACT_ZERO_MEM_PAGE 0x10 +#define XLOG_MULTIXACT_CREATE_ID 0x20 + +typedef struct xl_multixact_create +{ + MultiXactId mid; /* new MultiXact's ID */ + MultiXactOffset moff; /* its starting offset in members file */ + int32 nxids; /* number of member XIDs */ + TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */ +} xl_multixact_create; + +#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, xids) + + extern MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2); extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid); extern bool MultiXactIdIsRunning(MultiXactId multi); @@ -29,9 +51,16 @@ extern void MultiXactShmemInit(void); extern void BootStrapMultiXact(void); extern void StartupMultiXact(void); extern void ShutdownMultiXact(void); -extern MultiXactId MultiXactGetCheckptMulti(bool is_shutdown); +extern void MultiXactGetCheckptMulti(bool is_shutdown, + MultiXactId *nextMulti, + MultiXactOffset *nextMultiOffset); extern void CheckPointMultiXact(void); -extern void MultiXactSetNextMXact(MultiXactId nextMulti); -extern void MultiXactAdvanceNextMXact(MultiXactId minMulti); +extern void MultiXactSetNextMXact(MultiXactId nextMulti, + MultiXactOffset nextMultiOffset); +extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, + MultiXactOffset minMultiOffset); + +extern void multixact_redo(XLogRecPtr lsn, XLogRecord *record); +extern void multixact_desc(char *buf, uint8 xl_info, char *rec); #endif /* MULTIXACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 7547d7f5b9..ead4619b02 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.64 2005/06/06 20:22:58 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.65 2005/06/08 15:50:28 tgl Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -165,7 +165,6 @@ extern void ShutdownXLOG(int code, Datum arg); extern void InitXLOGAccess(void); extern void CreateCheckPoint(bool shutdown, bool force); extern void XLogPutNextOid(Oid nextOid); -extern void XLogPutNextMultiXactId(MultiXactId multi); extern XLogRecPtr GetRedoRecPtr(void); #endif /* XLOG_H */ diff --git a/src/include/c.h b/src/include/c.h index 6318c5573d..1a92038774 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -12,7 +12,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/c.h,v 1.184 2005/05/25 21:40:41 momjian Exp $ + * $PostgreSQL: pgsql/src/include/c.h,v 1.185 2005/06/08 15:50:28 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -388,6 +388,8 @@ typedef uint32 SubTransactionId; /* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; +typedef uint32 MultiXactOffset; + typedef uint32 CommandId; #define FirstCommandId ((CommandId) 0) diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 3f96b6bf26..73f32b55ad 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_control.h,v 1.22 2005/06/02 05:55:29 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_control.h,v 1.23 2005/06/08 15:50:28 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,7 +22,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 810 +#define PG_CONTROL_VERSION 811 /* * Body of CheckPoint XLOG records. This is declared here because we keep @@ -40,13 +40,13 @@ typedef struct CheckPoint TransactionId nextXid; /* next free XID */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ + MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ time_t time; /* time stamp of checkpoint */ } CheckPoint; /* XLOG info values for XLOG rmgr */ #define XLOG_CHECKPOINT_SHUTDOWN 0x00 #define XLOG_CHECKPOINT_ONLINE 0x10 -#define XLOG_NEXTMULTI 0x20 #define XLOG_NEXTOID 0x30 -- 2.40.0