From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 11 Aug 2004 04:07:16 +0000 (+0000)
Subject: Fix failure to guarantee that a checkpoint will write out pg_clog updates
X-Git-Tag: REL8_0_0BETA2~169
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3fdf649f4fc8a21ba4cec1db7f3fe7bb1105b00c;p=postgresql

Fix failure to guarantee that a checkpoint will write out pg_clog updates
for transaction commits that occurred just before the checkpoint.  This is
an EXTREMELY serious bug --- kudos to Satoshi Okada for creating a
reproducible test case to prove its existence.
---

diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 4794c761c3..594a2fcca1 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.177 2004/08/03 15:57:26 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.178 2004/08/11 04:07:15 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -574,13 +574,28 @@ RecordTransactionCommit(void)
 		START_CRIT_SECTION();
 
 		/*
-		 * We only need to log the commit in XLOG if the transaction made
-		 * any transaction-controlled XLOG entries or will delete files.
+		 * If our transaction made any transaction-controlled XLOG entries,
+		 * we need to lock out checkpoint start between writing our XLOG
+		 * record and updating pg_clog.  Otherwise it is possible for the
+		 * checkpoint to set REDO after the XLOG record but fail to flush the
+		 * pg_clog update to disk, leading to loss of the transaction commit
+		 * if we crash a little later.  Slightly kludgy fix for problem
+		 * discovered 2004-08-10.
+		 *
 		 * (If it made no transaction-controlled XLOG entries, its XID
 		 * appears nowhere in permanent storage, so no one else will ever care
-		 * if it committed.)
+		 * if it committed; so it doesn't matter if we lose the commit flag.)
+		 *
+		 * Note we only need a shared lock; CreateCheckPoint takes it exclusive.
 		 */
 		madeTCentries = (MyLastRecPtr.xrecoff != 0);
+		if (madeTCentries)
+			LWLockAcquire(CheckpointStartLock, LW_SHARED);
+
+		/*
+		 * We only need to log the commit in XLOG if the transaction made
+		 * any transaction-controlled XLOG entries or will delete files.
+		 */
 		if (madeTCentries || nrels > 0)
 		{
 			XLogRecData rdata[3];
@@ -668,6 +683,10 @@ RecordTransactionCommit(void)
 			TransactionIdCommitTree(nchildren, children);
 		}
 
+		/* Release CheckpointStartLock if we acquired it */
+		if (madeTCentries)
+			LWLockRelease(CheckpointStartLock);
+
 		END_CRIT_SECTION();
 	}
 
@@ -850,6 +869,8 @@ RecordTransactionAbort(void)
 		 *
 		 * We do not flush XLOG to disk unless deleting files, since the
 		 * default assumption after a crash would be that we aborted, anyway.
+		 * For the same reason, we don't need to worry about interlocking
+		 * against checkpoint start.
 		 */
 		if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
 		{
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 28fb4c733a..32ade5d759 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.158 2004/08/09 16:26:01 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.159 2004/08/11 04:07:15 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -4699,6 +4699,15 @@ CreateCheckPoint(bool shutdown, bool force)
 	checkPoint.ThisTimeLineID = ThisTimeLineID;
 	checkPoint.time = time(NULL);
 
+	/*
+	 * We must hold CheckpointStartLock while determining the checkpoint
+	 * REDO pointer.  This ensures that any concurrent transaction commits
+	 * will be either not yet logged, or logged and recorded in pg_clog.
+	 * See notes in RecordTransactionCommit().
+	 */
+	LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
+
+	/* And we need WALInsertLock too */
 	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
 	/*
@@ -4731,6 +4740,7 @@ CreateCheckPoint(bool shutdown, bool force)
 			ControlFile->checkPointCopy.redo.xrecoff)
 		{
 			LWLockRelease(WALInsertLock);
+			LWLockRelease(CheckpointStartLock);
 			LWLockRelease(CheckpointLock);
 			END_CRIT_SECTION();
 			return;
@@ -4789,6 +4799,9 @@ CreateCheckPoint(bool shutdown, bool force)
 	 * GetSnapshotData needs to get XidGenLock while holding SInvalLock,
 	 * so there's a risk of deadlock. Need to find a better solution.  See
 	 * pgsql-hackers discussion of 17-Dec-01.
+	 *
+	 * XXX actually, the whole UNDO code is dead code and unlikely to ever
+	 * be revived, so the lack of a good solution here is not troubling.
 	 */
 #ifdef NOT_USED
 	checkPoint.undo = GetUndoRecPtr();
@@ -4798,11 +4811,13 @@ CreateCheckPoint(bool shutdown, bool force)
 #endif
 
 	/*
-	 * Now we can release insert lock, allowing other xacts to proceed
-	 * even while we are flushing disk buffers.
+	 * Now we can release insert lock and checkpoint start lock, allowing
+	 * other xacts to proceed even while we are flushing disk buffers.
 	 */
 	LWLockRelease(WALInsertLock);
 
+	LWLockRelease(CheckpointStartLock);
+
 	/*
 	 * Get the other info we need for the checkpoint record.
 	 */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index b1f6fc9510..7b08231e51 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.12 2004/06/11 16:43:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.13 2004/08/11 04:07:16 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -36,6 +36,7 @@ typedef enum LWLockId
 	WALWriteLock,
 	ControlFileLock,
 	CheckpointLock,
+	CheckpointStartLock,
 	RelCacheInitLock,
 	BgWriterCommLock,