/*-------------------------------------------------------------------------
 *
 * PostgreSQL transaction-commit-log manager
 *
 * This module replaces the old "pg_log" access code, which treated pg_log
 * essentially like a relation, in that it went through the regular buffer
 * manager. The problem with that was that there wasn't any good way to
 * recycle storage space for transactions so old that they'll never be
 * looked up again. Now we use specialized access code so that the commit
 * log can be broken into relatively small, independent segments.
 *
 * XLOG interactions: this module generates an XLOG record whenever a new
 * CLOG page is initialized to zeroes. Other writes of CLOG come from
 * recording of transaction commit or abort in xact.c, which generates its
 * own XLOG records for these events and will re-perform the status update
 * on redo; so we need make no additional XLOG entry here. For synchronous
 * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
 * record before we are called to log a commit, so the WAL rule "write xlog
 * before data" is satisfied automatically. However, for async commits we
 * must track the latest LSN affecting each CLOG page, so that we can flush
 * XLOG that far and satisfy the WAL rule. We don't have to worry about this
 * for aborts (whether sync or async), since the post-crash assumption would
 * be that such transactions failed anyway.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.47 2008/08/01 13:16:08 alvherre Exp $
 *
 *-------------------------------------------------------------------------
 */
35 #include "access/clog.h"
36 #include "access/slru.h"
37 #include "access/transam.h"
39 #include "postmaster/bgwriter.h"
/*
 * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
 * everywhere else in Postgres.
 *
 * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
 * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
 * and CLOG segment numbering at 0xFFFFFFFF/CLOG_XACTS_PER_SEGMENT. We need
 * take no explicit notice of that fact in this module, except when comparing
 * segment and page numbers in TruncateCLOG (see CLOGPagePrecedes).
 */
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT  2
#define CLOG_XACTS_PER_BYTE 4
#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK   ((1 << CLOG_BITS_PER_XACT) - 1)

/*
 * Locate a transaction's status bits:
 *   TransactionIdToPage    - CLOG page number holding the xid
 *   TransactionIdToPgIndex - index of the xid within that page
 *   TransactionIdToByte    - byte offset within the page
 *   TransactionIdToBIndex  - index of the xid within that byte
 *                            (multiply by CLOG_BITS_PER_XACT for the shift)
 */
#define TransactionIdToPage(xid)    ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToByte(xid)    (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid)  ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)

/* We store the latest async LSN for each group of transactions */
#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)

/*
 * Index into the group_lsn array for the LSN group that covers xid on the
 * page held in buffer slot slotno.
 */
#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
    ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
72 * Link to shared-memory data structures for CLOG control
74 static SlruCtlData ClogCtlData;
76 #define ClogCtl (&ClogCtlData)
79 static int ZeroCLOGPage(int pageno, bool writeXlog);
80 static bool CLOGPagePrecedes(int page1, int page2);
81 static void WriteZeroPageXlogRec(int pageno);
82 static void WriteTruncateXlogRec(int pageno);
86 * Record the final state of a transaction in the commit log.
88 * lsn must be the WAL location of the commit record when recording an async
89 * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
90 * caller guarantees the commit record is already flushed in that case. It
91 * should be InvalidXLogRecPtr for abort cases, too.
93 * NB: this is a low-level routine and is NOT the preferred entry point
94 * for most uses; TransactionLogUpdate() in transam.c is the intended caller.
97 TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
99 int pageno = TransactionIdToPage(xid);
100 int byteno = TransactionIdToByte(xid);
101 int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
106 Assert(status == TRANSACTION_STATUS_COMMITTED ||
107 status == TRANSACTION_STATUS_ABORTED ||
108 status == TRANSACTION_STATUS_SUB_COMMITTED);
110 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
113 * If we're doing an async commit (ie, lsn is valid), then we must wait
114 * for any active write on the page slot to complete. Otherwise our
115 * update could reach disk in that write, which will not do since we
116 * mustn't let it reach disk until we've done the appropriate WAL flush.
117 * But when lsn is invalid, it's OK to scribble on a page while it is
118 * write-busy, since we don't care if the update reaches disk sooner than
119 * we think. Hence, pass write_ok = XLogRecPtrIsInvalid(lsn).
121 slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
122 byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
124 /* Current state should be 0, subcommitted or target state */
125 Assert(((*byteptr >> bshift) & CLOG_XACT_BITMASK) == 0 ||
126 ((*byteptr >> bshift) & CLOG_XACT_BITMASK) == TRANSACTION_STATUS_SUB_COMMITTED ||
127 ((*byteptr >> bshift) & CLOG_XACT_BITMASK) == status);
129 /* note this assumes exclusive access to the clog page */
131 byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
132 byteval |= (status << bshift);
135 ClogCtl->shared->page_dirty[slotno] = true;
138 * Update the group LSN if the transaction completion LSN is higher.
140 * Note: lsn will be invalid when supplied during InRecovery processing,
141 * so we don't need to do anything special to avoid LSN updates during
142 * recovery. After recovery completes the next clog change will set the
145 if (!XLogRecPtrIsInvalid(lsn))
147 int lsnindex = GetLSNIndex(slotno, xid);
149 if (XLByteLT(ClogCtl->shared->group_lsn[lsnindex], lsn))
150 ClogCtl->shared->group_lsn[lsnindex] = lsn;
153 LWLockRelease(CLogControlLock);
157 * Interrogate the state of a transaction in the commit log.
159 * Aside from the actual commit status, this function returns (into *lsn)
160 * an LSN that is late enough to be able to guarantee that if we flush up to
161 * that LSN then we will have flushed the transaction's commit record to disk.
162 * The result is not necessarily the exact LSN of the transaction's commit
163 * record! For example, for long-past transactions (those whose clog pages
164 * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
165 * we group transactions on the same clog page to conserve storage, we might
166 * return the LSN of a later transaction that falls into the same group.
168 * NB: this is a low-level routine and is NOT the preferred entry point
169 * for most uses; TransactionLogFetch() in transam.c is the intended caller.
172 TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
174 int pageno = TransactionIdToPage(xid);
175 int byteno = TransactionIdToByte(xid);
176 int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
182 /* lock is acquired by SimpleLruReadPage_ReadOnly */
184 slotno = SimpleLruReadPage_ReadOnly(ClogCtl, pageno, xid);
185 byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
187 status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
189 lsnindex = GetLSNIndex(slotno, xid);
190 *lsn = ClogCtl->shared->group_lsn[lsnindex];
192 LWLockRelease(CLogControlLock);
199 * Initialization of shared memory for CLOG
/*
 * NOTE(review): this listing has lost the function headers and braces that
 * surround the statements below, so they are documented as fragments.
 * Presumably the first statement is the body of the shared-memory sizing
 * routine and the next two are the body of CLOGShmemInit -- confirm against
 * a pristine copy of the file.
 */
/*
 * Size request: hands back whatever SimpleLruShmemSize computes for
 * NUM_CLOG_BUFFERS buffers with CLOG_LSNS_PER_PAGE LSN entries per page.
 */
204 return SimpleLruShmemSize(NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE);
/*
 * Setup: install the page-ordering callback, then initialize the SLRU
 * control data with the same buffer and LSN counts used for sizing.
 */
210 ClogCtl->PagePrecedes = CLOGPagePrecedes;
211 SimpleLruInit(ClogCtl, "CLOG Ctl", NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE,
212 CLogControlLock, "pg_clog");
216 * This func must be called ONCE on system install. It creates
217 * the initial CLOG segment. (The CLOG directory is assumed to
218 * have been created by the initdb shell script, and CLOGShmemInit
219 * must have been called already.)
/*
 * NOTE(review): the function header, braces and the declaration of slotno
 * are missing from this listing; presumably this is the bootstrap entry
 * point -- confirm against a pristine copy of the file.
 */
/* ZeroCLOGPage requires the control lock to be held by its caller. */
226 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
228 /* Create and zero the first page of the commit log */
/* writeXlog is false here, so no XLOG record is emitted for page 0. */
229 slotno = ZeroCLOGPage(0, false);
231 /* Make sure it's written out */
232 SimpleLruWritePage(ClogCtl, slotno, NULL);
/* The write is expected to leave the slot marked clean. */
233 Assert(!ClogCtl->shared->page_dirty[slotno]);
235 LWLockRelease(CLogControlLock);
239 * Initialize (or reinitialize) a page of CLOG to zeroes.
240 * If writeXlog is TRUE, also emit an XLOG record saying we did this.
242 * The page is not actually written, just set up in shared memory.
243 * The slot number of the new page is returned.
245 * Control lock must be held at entry, and will be held at exit.
248 ZeroCLOGPage(int pageno, bool writeXlog)
252 slotno = SimpleLruZeroPage(ClogCtl, pageno);
255 WriteZeroPageXlogRec(pageno);
261 * This must be called ONCE during postmaster or standalone-backend startup,
262 * after StartupXLOG has initialized ShmemVariableCache->nextXid.
/*
 * NOTE(review): the function header, braces, comment delimiters and the
 * declarations of slotno and byteptr are missing from this listing;
 * presumably this is the startup entry point -- confirm against a pristine
 * copy of the file.
 */
/* xid is the next XID to be assigned; pageno is the CLOG page holding it. */
267 TransactionId xid = ShmemVariableCache->nextXid;
268 int pageno = TransactionIdToPage(xid);
270 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
273 * Initialize our idea of the latest page number.
275 ClogCtl->shared->latest_page_number = pageno;
278 * Zero out the remainder of the current clog page. Under normal
279 * circumstances it should be zeroes already, but it seems at least
280 * theoretically possible that XLOG replay will have settled on a nextXID
281 * value that is less than the last XID actually used and marked by the
282 * previous database lifecycle (since subtransaction commit writes clog
283 * but makes no WAL entry). Let's just be safe. (We need not worry about
284 * pages beyond the current one, since those will be zeroed when first
285 * used. For the same reason, there is no need to do anything when
286 * nextXid is exactly at a page boundary; and it's likely that the
287 * "current" page doesn't exist yet in that case.)
289 if (TransactionIdToPgIndex(xid) != 0)
291 int byteno = TransactionIdToByte(xid);
292 int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
/* write_ok is passed as false for this read. */
296 slotno = SimpleLruReadPage(ClogCtl, pageno, false, xid);
297 byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
299 /* Zero so-far-unused positions in the current byte */
/* The mask keeps the bits below bshift and clears bshift and above. */
300 *byteptr &= (1 << bshift) - 1;
301 /* Zero the rest of the page */
/* Covers every byte after byteno up to the end of the BLCKSZ page. */
302 MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
304 ClogCtl->shared->page_dirty[slotno] = true;
307 LWLockRelease(CLogControlLock);
311 * This must be called ONCE during postmaster or standalone-backend shutdown
/*
 * NOTE(review): the function headers and braces for the two fragments below
 * are missing from this listing; presumably they are the shutdown and
 * checkpoint entry points -- confirm against a pristine copy of the file.
 * The two fragments differ only in the boolean passed to the trace probes
 * and to SimpleLruFlush (false for shutdown, true for checkpoint); the
 * meaning of that flag is defined outside this file -- TODO confirm.
 */
316 /* Flush dirty CLOG pages to disk */
317 TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false);
318 SimpleLruFlush(ClogCtl, false);
319 TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false);
323 * Perform a checkpoint --- either during shutdown, or on-the-fly
328 /* Flush dirty CLOG pages to disk */
329 TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
330 SimpleLruFlush(ClogCtl, true);
331 TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
336 * Make sure that CLOG has room for a newly-allocated XID.
338 * NB: this is called while holding XidGenLock. We want it to be very fast
339 * most of the time; even when it's not so fast, no actual I/O need happen
340 * unless we're forced to write out a dirty clog or xlog page to make room
344 ExtendCLOG(TransactionId newestXact)
349 * No work except at first XID of a page. But beware: just after
350 * wraparound, the first XID of page zero is FirstNormalTransactionId.
352 if (TransactionIdToPgIndex(newestXact) != 0 &&
353 !TransactionIdEquals(newestXact, FirstNormalTransactionId))
356 pageno = TransactionIdToPage(newestXact);
358 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
360 /* Zero the page and make an XLOG entry about it */
361 ZeroCLOGPage(pageno, true);
363 LWLockRelease(CLogControlLock);
368 * Remove all CLOG segments before the one holding the passed transaction ID
370 * Before removing any CLOG data, we must flush XLOG to disk, to ensure
371 * that any recently-emitted HEAP_FREEZE records have reached disk; otherwise
372 * a crash and restart might leave us with some unfrozen tuples referencing
373 * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
374 * Replaying the deletion from XLOG is not critical, since the files could
375 * just as well be removed later, but doing so prevents a long-running hot
376 * standby server from acquiring an unreasonably bloated CLOG directory.
378 * Since CLOG segments hold a large number of transactions, the opportunity to
379 * actually remove a segment is fairly rare, and so it seems best not to do
380 * the XLOG flush unless we have confirmed that there is a removable segment.
383 TruncateCLOG(TransactionId oldestXact)
388 * The cutoff point is the start of the segment containing oldestXact. We
389 * pass the *page* containing oldestXact to SimpleLruTruncate.
391 cutoffPage = TransactionIdToPage(oldestXact);
393 /* Check to see if there's any files that could be removed */
394 if (!SlruScanDirectory(ClogCtl, cutoffPage, false))
395 return; /* nothing to remove */
397 /* Write XLOG record and flush XLOG to disk */
398 WriteTruncateXlogRec(cutoffPage);
400 /* Now we can remove the old CLOG segment(s) */
401 SimpleLruTruncate(ClogCtl, cutoffPage);
406 * Decide which of two CLOG page numbers is "older" for truncation purposes.
408 * We need to use comparison of TransactionIds here in order to do the right
409 * thing with wraparound XID arithmetic. However, if we are asked about
410 * page number zero, we don't want to hand InvalidTransactionId to
411 * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So,
412 * offset both xids by FirstNormalTransactionId to avoid that.
415 CLOGPagePrecedes(int page1, int page2)
420 xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
421 xid1 += FirstNormalTransactionId;
422 xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
423 xid2 += FirstNormalTransactionId;
425 return TransactionIdPrecedes(xid1, xid2);
430 * Write a ZEROPAGE xlog record
433 WriteZeroPageXlogRec(int pageno)
437 rdata.data = (char *) (&pageno);
438 rdata.len = sizeof(int);
439 rdata.buffer = InvalidBuffer;
441 (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE, &rdata);
445 * Write a TRUNCATE xlog record
447 * We must flush the xlog record to disk before returning --- see notes
451 WriteTruncateXlogRec(int pageno)
456 rdata.data = (char *) (&pageno);
457 rdata.len = sizeof(int);
458 rdata.buffer = InvalidBuffer;
460 recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE, &rdata);
465 * CLOG resource manager's routines
468 clog_redo(XLogRecPtr lsn, XLogRecord *record)
470 uint8 info = record->xl_info & ~XLR_INFO_MASK;
472 if (info == CLOG_ZEROPAGE)
477 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
479 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
481 slotno = ZeroCLOGPage(pageno, false);
482 SimpleLruWritePage(ClogCtl, slotno, NULL);
483 Assert(!ClogCtl->shared->page_dirty[slotno]);
485 LWLockRelease(CLogControlLock);
487 else if (info == CLOG_TRUNCATE)
491 memcpy(&pageno, XLogRecGetData(record), sizeof(int));
494 * During XLOG replay, latest_page_number isn't set up yet; insert a
495 * suitable value to bypass the sanity test in SimpleLruTruncate.
497 ClogCtl->shared->latest_page_number = pageno;
499 SimpleLruTruncate(ClogCtl, pageno);
502 elog(PANIC, "clog_redo: unknown op code %u", info);
506 clog_desc(StringInfo buf, uint8 xl_info, char *rec)
508 uint8 info = xl_info & ~XLR_INFO_MASK;
510 if (info == CLOG_ZEROPAGE)
514 memcpy(&pageno, rec, sizeof(int));
515 appendStringInfo(buf, "zeropage: %d", pageno);
517 else if (info == CLOG_TRUNCATE)
521 memcpy(&pageno, rec, sizeof(int));
522 appendStringInfo(buf, "truncate before: %d", pageno);
525 appendStringInfo(buf, "UNKNOWN");