From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 8 Nov 2000 22:10:03 +0000 (+0000)
Subject: Make DROP TABLE rollback-able: postpone physical file delete until commit.
X-Git-Tag: REL7_1_BETA~224
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3908473c80;p=postgresql

Make DROP TABLE rollback-able: postpone physical file delete until commit.
(WAL logging for this is not done yet, however.)  Clean up a number of really
crufty things that are no longer needed now that DROP behaves nicely.  Make
the temp table mapper do the right thing when a drop or rename affecting a
temp table is rolled back.  Also, remove the "relation modified while in use"
error check, in favor of locking tables at first reference and holding that
lock throughout the statement.
---

diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index 9b59947a88..1ed2366efd 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/common/tupdesc.c,v 1.67 2000/10/05 19:48:20 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/common/tupdesc.c,v 1.68 2000/11/08 22:09:53 tgl Exp $
  *
  * NOTES
  *	  some of the executor utility code such as "ExecTypeFromTL" should be
@@ -228,7 +228,9 @@ FreeTupleDesc(TupleDesc tupdesc)
 bool
 equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
 {
-	int			i;
+	int			i,
+				j,
+				n;
 
 	if (tupdesc1->natts != tupdesc2->natts)
 		return false;
@@ -240,7 +242,9 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
 		/*
 		 * We do not need to check every single field here, and in fact
 		 * some fields such as attdispersion probably shouldn't be
-		 * compared.
+		 * compared.  We can also disregard attnum (it was used to
+		 * place the row in the attrs array) and everything derived
+		 * from the column datatype.
 		 */
 		if (strcmp(NameStr(attr1->attname), NameStr(attr2->attname)) != 0)
 			return false;
@@ -260,32 +264,53 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
 
 		if (constr2 == NULL)
 			return false;
-		if (constr1->num_defval != constr2->num_defval)
+		if (constr1->has_not_null != constr2->has_not_null)
+			return false;
+		n = constr1->num_defval;
+		if (n != (int) constr2->num_defval)
 			return false;
-		for (i = 0; i < (int) constr1->num_defval; i++)
+		for (i = 0; i < n; i++)
 		{
 			AttrDefault *defval1 = constr1->defval + i;
-			AttrDefault *defval2 = constr2->defval + i;
+			AttrDefault *defval2 = constr2->defval;
 
-			if (defval1->adnum != defval2->adnum)
+			/*
+			 * We can't assume that the items are always read from the
+			 * system catalogs in the same order; so use the adnum field to
+			 * identify the matching item to compare.
+			 */
+			for (j = 0; j < n; defval2++, j++)
+			{
+				if (defval1->adnum == defval2->adnum)
+					break;
+			}
+			if (j >= n)
 				return false;
 			if (strcmp(defval1->adbin, defval2->adbin) != 0)
 				return false;
 		}
-		if (constr1->num_check != constr2->num_check)
+		n = constr1->num_check;
+		if (n != (int) constr2->num_check)
 			return false;
-		for (i = 0; i < (int) constr1->num_check; i++)
+		for (i = 0; i < n; i++)
 		{
 			ConstrCheck *check1 = constr1->check + i;
-			ConstrCheck *check2 = constr2->check + i;
+			ConstrCheck *check2 = constr2->check;
 
-			if (strcmp(check1->ccname, check2->ccname) != 0)
-				return false;
-			if (strcmp(check1->ccbin, check2->ccbin) != 0)
+			/*
+			 * Similarly, don't assume that the checks are always read
+			 * in the same order; match them up by name and contents.
+			 * (The name *should* be unique, but...)
+			 */
+			for (j = 0; j < n; check2++, j++)
+			{
+				if (strcmp(check1->ccname, check2->ccname) == 0 &&
+					strcmp(check1->ccbin, check2->ccbin) == 0)
+					break;
+			}
+			if (j >= n)
 				return false;
 		}
-		if (constr1->has_not_null != constr2->has_not_null)
-			return false;
 	}
 	else if (tupdesc2->constr != NULL)
 		return false;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 560f28743f..d7bfeb1287 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -6,7 +6,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.63 2000/10/21 15:43:09 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.64 2000/11/08 22:09:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -266,13 +266,12 @@ gistbuild(PG_FUNCTION_ARGS)
 	{
 		Oid			hrelid = RelationGetRelid(heap);
 		Oid			irelid = RelationGetRelid(index);
-		bool		inplace = IsReindexProcessing();
 
 		heap_close(heap, NoLock);
 		index_close(index);
-		UpdateStats(hrelid, nhtups, inplace);
-		UpdateStats(irelid, nitups, inplace);
-		if (oldPred != NULL && !inplace)
+		UpdateStats(hrelid, nhtups);
+		UpdateStats(irelid, nitups);
+		if (oldPred != NULL)
 		{
 			if (nitups == nhtups)
 				pred = NULL;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 8db80d5154..333199a898 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.43 2000/10/21 15:43:11 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.44 2000/11/08 22:09:54 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -217,13 +217,12 @@ hashbuild(PG_FUNCTION_ARGS)
 	{
 		Oid			hrelid = RelationGetRelid(heap);
 		Oid			irelid = RelationGetRelid(index);
-		bool		inplace = IsReindexProcessing();
 
 		heap_close(heap, NoLock);
 		index_close(index);
-		UpdateStats(hrelid, nhtups, inplace);
-		UpdateStats(irelid, nitups, inplace);
-		if (oldPred != NULL && !inplace)
+		UpdateStats(hrelid, nhtups);
+		UpdateStats(irelid, nitups);
+		if (oldPred != NULL)
 		{
 			if (nitups == nhtups)
 				pred = NULL;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e796762ef1..5f450f9152 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.92 2000/10/29 18:33:39 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.93 2000/11/08 22:09:54 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -780,19 +780,13 @@ heap_beginscan(Relation relation,
 
 	/* ----------------
 	 *	increment relation ref count while scanning relation
-	 * ----------------
-	 */
-	RelationIncrementReferenceCount(relation);
-
-	/* ----------------
-	 *	Acquire AccessShareLock for the duration of the scan
 	 *
-	 *	Note: we could get an SI inval message here and consequently have
-	 *	to rebuild the relcache entry.	The refcount increment above
-	 *	ensures that we will rebuild it and not just flush it...
+	 *	This is just to make really sure the relcache entry won't go away
+	 *	while the scan has a pointer to it.  Caller should be holding the
+	 *	rel open anyway, so this is redundant in all normal scenarios...
 	 * ----------------
 	 */
-	LockRelation(relation, AccessShareLock);
+	RelationIncrementReferenceCount(relation);
 
 	/* XXX someday assert SelfTimeQual if relkind == RELKIND_UNCATALOGED */
 	if (relation->rd_rel->relkind == RELKIND_UNCATALOGED)
@@ -809,13 +803,11 @@ heap_beginscan(Relation relation,
 	scan->rs_snapshot = snapshot;
 	scan->rs_nkeys = (short) nkeys;
 
+	/*
+	 * we do this here instead of in initscan() because heap_rescan
+	 * also calls initscan() and we don't want to allocate memory again
+	 */
 	if (nkeys)
-
-		/*
-		 * we do this here instead of in initscan() because heap_rescan
-		 * also calls initscan() and we don't want to allocate memory
-		 * again
-		 */
 		scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
 	else
 		scan->rs_key = NULL;
@@ -841,8 +833,6 @@ heap_rescan(HeapScanDesc scan,
 	IncrHeapAccessStat(local_rescan);
 	IncrHeapAccessStat(global_rescan);
 
-	/* Note: set relation level read lock is still set */
-
 	/* ----------------
 	 *	unpin scan buffers
 	 * ----------------
@@ -853,7 +843,7 @@ heap_rescan(HeapScanDesc scan,
 	 *	reinitialize scan descriptor
 	 * ----------------
 	 */
-	scan->rs_atend = (bool) scanFromEnd;
+	scan->rs_atend = scanFromEnd;
 	initscan(scan, scan->rs_rd, scanFromEnd, scan->rs_nkeys, key);
 }
 
@@ -882,12 +872,6 @@ heap_endscan(HeapScanDesc scan)
 	 */
 	unpinscan(scan);
 
-	/* ----------------
-	 *	Release AccessShareLock acquired by heap_beginscan()
-	 * ----------------
-	 */
-	UnlockRelation(scan->rs_rd, AccessShareLock);
-
 	/* ----------------
 	 *	decrement relation reference count and free scan descriptor storage
 	 * ----------------
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fb437ac99f..ab942844af 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.69 2000/11/01 20:39:58 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.70 2000/11/08 22:09:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -340,19 +340,16 @@ btbuild(PG_FUNCTION_ARGS)
 	{
 		Oid			hrelid = RelationGetRelid(heap);
 		Oid			irelid = RelationGetRelid(index);
-		bool		inplace = IsReindexProcessing();
 
 		heap_close(heap, NoLock);
 		index_close(index);
-
-		UpdateStats(hrelid, nhtups, inplace);
-		UpdateStats(irelid, nitups, inplace);
+		UpdateStats(hrelid, nhtups);
+		UpdateStats(irelid, nitups);
 		if (oldPred != NULL)
 		{
 			if (nitups == nhtups)
 				pred = NULL;
-			if (!inplace)
-				UpdateIndexPredicate(irelid, oldPred, pred);
+			UpdateIndexPredicate(irelid, oldPred, pred);
 		}
 	}
 
diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c
index 60f6a2f6ca..ee5f621c0c 100644
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.54 2000/10/21 15:43:20 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.55 2000/11/08 22:09:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -237,13 +237,12 @@ rtbuild(PG_FUNCTION_ARGS)
 	{
 		Oid			hrelid = RelationGetRelid(heap);
 		Oid			irelid = RelationGetRelid(index);
-		bool		inplace = IsReindexProcessing();
 
 		heap_close(heap, NoLock);
 		index_close(index);
-		UpdateStats(hrelid, nhtups, inplace);
-		UpdateStats(irelid, nitups, inplace);
-		if (oldPred != NULL && !inplace)
+		UpdateStats(hrelid, nhtups);
+		UpdateStats(irelid, nitups);
+		if (oldPred != NULL)
 		{
 			if (nitups == nhtups)
 				pred = NULL;
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index f29b41d749..8c7d98ca70 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/varsup.c,v 1.31 2000/11/03 11:39:35 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/varsup.c,v 1.32 2000/11/08 22:09:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -130,7 +130,7 @@ VariableRelationPutNextXid(TransactionId xid)
 
 	TransactionIdStore(xid, &(var->nextXidData));
 
-	FlushBuffer(buf, TRUE);
+	FlushBuffer(buf, true, true);
 }
 
 /* --------------------------------
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index cc4209fa51..97ff91fc44 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.80 2000/11/05 22:50:19 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.81 2000/11/08 22:09:55 tgl Exp $
  *
  * NOTES
  *		Transaction aborts can now occur two ways:
@@ -167,6 +167,7 @@
 #include "miscadmin.h"
 #include "storage/proc.h"
 #include "storage/sinval.h"
+#include "storage/smgr.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/portal.h"
@@ -1105,6 +1106,9 @@ CommitTransaction(void)
 	}
 
 	RelationPurgeLocalRelation(true);
+	AtEOXact_temp_relations(true);
+	smgrDoPendingDeletes(true);
+
 	AtEOXact_SPI();
 	AtEOXact_nbtree();
 	AtCommit_Cache();
@@ -1181,8 +1185,11 @@ AbortTransaction(void)
 	CloseSequences();
 	AtEOXact_portals();
 	RecordTransactionAbort();
+
 	RelationPurgeLocalRelation(false);
-	remove_temp_rel_in_myxid();
+	AtEOXact_temp_relations(false);
+	smgrDoPendingDeletes(false);
+
 	AtEOXact_SPI();
 	AtEOXact_nbtree();
 	AtAbort_Cache();
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 1fdc7e83fb..b644278733 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -171,9 +171,8 @@ XLogOpenLogRelation(void)
 	sprintf(RelationGetPhysicalRelationName(logRelation), "pg_log");
 	logRelation->rd_node.tblNode = InvalidOid;
 	logRelation->rd_node.relNode = RelOid_pg_log;
-	logRelation->rd_unlinked = false;	/* must exists */
 	logRelation->rd_fd = -1;
-	logRelation->rd_fd = smgropen(DEFAULT_SMGR, logRelation);
+	logRelation->rd_fd = smgropen(DEFAULT_SMGR, logRelation, false);
 	if (logRelation->rd_fd < 0)
 		elog(STOP, "XLogOpenLogRelation: failed to open pg_log");
 	LogRelation = logRelation;
@@ -384,9 +383,9 @@ XLogOpenRelation(bool redo, RmgrId rmid, RelFileNode rnode)
 
 		hentry->rdesc = res;
 
-		res->reldata.rd_unlinked = true;	/* look smgropen */
 		res->reldata.rd_fd = -1;
-		res->reldata.rd_fd = smgropen(DEFAULT_SMGR, &(res->reldata));
+		res->reldata.rd_fd = smgropen(DEFAULT_SMGR, &(res->reldata),
+									  true /* allow failure */);
 	}
 
 	res->moreRecently = &(_xlrelarr[0]);
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index de4cc3dd99..e4e26d0c3c 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/bootstrap/bootstrap.c,v 1.96 2000/11/04 12:43:23 petere Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/bootstrap/bootstrap.c,v 1.97 2000/11/08 22:09:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1151,7 +1151,7 @@ build_indices()
 		 * -mer
 		 */
 		if (!BootstrapAlreadySeen(RelationGetRelid(heap)))
-			UpdateStats(RelationGetRelid(heap), 0, true);
+			UpdateStats(RelationGetRelid(heap), 0);
 
 		/* XXX Probably we ought to close the heap and index here? */
 	}
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index f2df9c3c07..2c4a9e515b 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.150 2000/10/22 23:32:38 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.151 2000/11/08 22:09:56 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -289,8 +289,7 @@ heap_create(char *relname,
 	 */
 	rel = (Relation) palloc(sizeof(RelationData));
 	MemSet((char *) rel, 0, sizeof(RelationData));
-	rel->rd_fd = -1;			/* table is not open */
-	rel->rd_unlinked = true;	/* table is not created yet */
+	rel->rd_fd = -1;			/* physical file is not open */
 
 	RelationSetReferenceCount(rel, 1);
 
@@ -345,8 +344,6 @@ heap_create(char *relname,
 	 *	have the storage manager create the relation.
 	 * ----------------
 	 */
-
-	/* smgrcreate() is moved to heap_storage_create() */
 	if (storage_create)
 		heap_storage_create(rel);
 
@@ -355,18 +352,12 @@ heap_create(char *relname,
 	return rel;
 }
 
-bool
+void
 heap_storage_create(Relation rel)
 {
-	bool		smgrcall = false;
-
-	if (rel->rd_unlinked)
-	{
-		rel->rd_fd = (File) smgrcreate(DEFAULT_SMGR, rel);
-		rel->rd_unlinked = false;
-		smgrcall = true;
-	}
-	return smgrcall;
+	Assert(rel->rd_fd < 0);
+	rel->rd_fd = smgrcreate(DEFAULT_SMGR, rel);
+	Assert(rel->rd_fd >= 0);
 }
 
 /* ----------------------------------------------------------------
@@ -1062,7 +1053,11 @@ RelationRemoveIndexes(Relation relation)
 						  &entry);
 
 	while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
+	{
 		index_drop(((Form_pg_index) GETSTRUCT(tuple))->indexrelid);
+		/* advance cmd counter to make catalog changes visible */
+		CommandCounterIncrement();
+	}
 
 	heap_endscan(scan);
 	heap_close(indexRelation, RowExclusiveLock);
@@ -1165,10 +1160,10 @@ RelationTruncateIndexes(Oid heapId)
 		LockRelation(currentIndex, AccessExclusiveLock);
 
 		/*
-		 * Release any buffers associated with this index.	If they're
+		 * Drop any buffers associated with this index.	If they're
 		 * dirty, they're just dropped without bothering to flush to disk.
 		 */
-		ReleaseRelationBuffers(currentIndex);
+		DropRelationBuffers(currentIndex);
 
 		/* Now truncate the actual data and set blocks to zero */
 		smgrtruncate(DEFAULT_SMGR, currentIndex, 0);
@@ -1212,24 +1207,19 @@ heap_truncate(char *relname)
 	/* ----------------
 	 *	TRUNCATE TABLE within a transaction block is dangerous, because
 	 *	if the transaction is later rolled back we have no way to
-	 *	undo truncation of the relation's physical file.  For now, allow it
-	 *	but emit a warning message.
-	 *	Someday we might want to consider postponing the physical truncate
-	 *	until transaction commit, but that's a lot of work...
-	 *	The only case that actually works right is for relations created
-	 *	in the current transaction, since the post-abort state would be that
-	 *	they don't exist anyway.  So, no warning in that case.
+	 *	undo truncation of the relation's physical file.  Disallow it
+	 *	except for a rel created in the current xact (which would be deleted
+	 *	on abort, anyway).
 	 * ----------------
 	 */
 	if (IsTransactionBlock() && !rel->rd_myxactonly)
-		elog(NOTICE, "Caution: TRUNCATE TABLE cannot be rolled back, so don't abort now");
+		elog(ERROR, "TRUNCATE TABLE cannot run inside a BEGIN/END block");
 
 	/*
 	 * Release any buffers associated with this relation.  If they're
 	 * dirty, they're just dropped without bothering to flush to disk.
 	 */
-
-	ReleaseRelationBuffers(rel);
+	DropRelationBuffers(rel);
 
 	/* Now truncate the actual data and set blocks to zero */
 
@@ -1416,8 +1406,9 @@ heap_drop_with_catalog(const char *relname,
 {
 	Relation	rel;
 	Oid			rid;
-	bool		istemp = (get_temp_rel_by_username(relname) != NULL);
 	bool		has_toasttable;
+	bool		istemp = (get_temp_rel_by_username(relname) != NULL);
+	int			i;
 
 	/* ----------------
 	 *	Open and lock the relation.
@@ -1425,6 +1416,7 @@ heap_drop_with_catalog(const char *relname,
 	 */
 	rel = heap_openr(relname, AccessExclusiveLock);
 	rid = RelationGetRelid(rel);
+	has_toasttable = rel->rd_rel->reltoastrelid != InvalidOid;
 
 	/* ----------------
 	 *	prevent deletion of system relations
@@ -1433,46 +1425,40 @@ heap_drop_with_catalog(const char *relname,
 	/* allow temp of pg_class? Guess so. */
 	if (!istemp && !allow_system_table_mods &&
 		IsSystemRelationName(RelationGetRelationName(rel)))
-		elog(ERROR, "System relation '%s' cannot be destroyed",
+		elog(ERROR, "System relation \"%s\" may not be dropped",
 			 RelationGetRelationName(rel));
 
 	/* ----------------
-	 *	DROP TABLE within a transaction block is dangerous, because
-	 *	if the transaction is later rolled back there will be no way to
-	 *	undo the unlink of the relation's physical file.  For now, allow it
-	 *	but emit a warning message.
-	 *	Someday we might want to consider postponing the physical unlink
-	 *	until transaction commit, but that's a lot of work...
-	 *	The only case that actually works right is for relations created
-	 *	in the current transaction, since the post-abort state would be that
-	 *	they don't exist anyway.  So, no warning in that case.
+	 * Release all buffers that belong to this relation, after writing
+	 * any that are dirty
 	 * ----------------
 	 */
-	if (IsTransactionBlock() && !rel->rd_myxactonly)
-		elog(NOTICE, "Caution: DROP TABLE cannot be rolled back, so don't abort now");
+	i = FlushRelationBuffers(rel, (BlockNumber) 0);
+	if (i < 0)
+		elog(ERROR, "heap_drop_with_catalog: FlushRelationBuffers returned %d",
+			 i);
 
 	/* ----------------
-	 *	remove inheritance information
+	 *	remove rules if necessary
 	 * ----------------
 	 */
-	RelationRemoveInheritance(rel);
+	if (rel->rd_rules != NULL)
+		RelationRemoveRules(rid);
+
+	/* triggers */
+	RelationRemoveTriggers(rel);
 
 	/* ----------------
-	 *	remove indexes if necessary
+	 *	remove inheritance information
 	 * ----------------
 	 */
-	/* should ignore relhasindex */
-	RelationRemoveIndexes(rel);
+	RelationRemoveInheritance(rel);
 
 	/* ----------------
-	 *	remove rules if necessary
+	 *	remove indexes if necessary
 	 * ----------------
 	 */
-	if (rel->rd_rules != NULL)
-		RelationRemoveRules(rid);
-
-	/* triggers */
-	RelationRemoveTriggers(rel);
+	RelationRemoveIndexes(rel);
 
 	/* ----------------
 	 *	delete attribute tuples
@@ -1502,23 +1488,12 @@ heap_drop_with_catalog(const char *relname,
 	 */
 	DeleteRelationTuple(rel);
 
-	/*
-	 * release dirty buffers of this relation; don't bother to write them
-	 */
-	ReleaseRelationBuffers(rel);
-
 	/* ----------------
 	 *	unlink the relation's physical file and finish up.
 	 * ----------------
 	 */
-	if (rel->rd_rel->relkind != RELKIND_VIEW && ! rel->rd_unlinked)
+	if (rel->rd_rel->relkind != RELKIND_VIEW)
 		smgrunlink(DEFAULT_SMGR, rel);
-	rel->rd_unlinked = true;
-
-	/*
-	 * Remember if there is a toast relation for below
-	 */
-	has_toasttable = rel->rd_rel->reltoastrelid != InvalidOid;
 
 	/*
 	 * Close relcache entry, but *keep* AccessExclusiveLock on the
@@ -1533,6 +1508,7 @@ heap_drop_with_catalog(const char *relname,
 	 */
 	RelationForgetRelation(rid);
 
+	/* and from the temp-table map */
 	if (istemp)
 		remove_temp_rel_by_relid(rid);
 
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 33aa67fe45..3833c961f4 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/index.c,v 1.128 2000/10/11 21:28:18 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/index.c,v 1.129 2000/11/08 22:09:56 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -26,6 +26,7 @@
 #include "access/heapam.h"
 #include "access/istrat.h"
 #include "bootstrap/bootstrap.h"
+#include "catalog/catalog.h"
 #include "catalog/catname.h"
 #include "catalog/heap.h"
 #include "catalog/index.h"
@@ -43,10 +44,10 @@
 #include "utils/builtins.h"
 #include "utils/catcache.h"
 #include "utils/fmgroids.h"
+#include "utils/inval.h"
 #include "utils/relcache.h"
 #include "utils/syscache.h"
 #include "utils/temprel.h"
-#include "utils/inval.h"
 
 /*
  * macros used in guessing how many tuples are on a page.
@@ -927,6 +928,13 @@ index_create(char *heapRelationName,
 	indexRelation = heap_create(indexRelationName, indexTupDesc,
 								istemp, false, allow_system_table_mods);
 
+	/*
+	 * Obtain exclusive lock on it.  Although no other backends can see it
+	 * until we commit, this prevents deadlock-risk complaints from lock
+	 * manager in cases such as CLUSTER.
+	 */
+	LockRelation(indexRelation, AccessExclusiveLock);
+
 	/* ----------------
 	 *	  construct the index relation descriptor
 	 *
@@ -990,7 +998,8 @@ index_create(char *heapRelationName,
 	 *
 	 * In normal processing mode, the heap and index relations are closed by
 	 * index_build() --- but we continue to hold the ShareLock on the heap
-	 * that we acquired above, until end of transaction.
+	 * and the exclusive lock on the index that we acquired above, until
+	 * end of transaction.
 	 */
 	if (IsBootstrapProcessingMode())
 	{
@@ -1020,6 +1029,7 @@ index_drop(Oid indexId)
 	Relation	attributeRelation;
 	HeapTuple	tuple;
 	int16		attnum;
+	int			i;
 
 	Assert(OidIsValid(indexId));
 
@@ -1040,19 +1050,11 @@ index_drop(Oid indexId)
 	LockRelation(userIndexRelation, AccessExclusiveLock);
 
 	/* ----------------
-	 *	DROP INDEX within a transaction block is dangerous, because
-	 *	if the transaction is later rolled back there will be no way to
-	 *	undo the unlink of the relation's physical file.  For now, allow it
-	 *	but emit a warning message.
-	 *	Someday we might want to consider postponing the physical unlink
-	 *	until transaction commit, but that's a lot of work...
-	 *	The only case that actually works right is for relations created
-	 *	in the current transaction, since the post-abort state would be that
-	 *	they don't exist anyway.  So, no warning in that case.
+	 *	Note: unlike heap_drop_with_catalog, we do not need to prevent
+	 *	deletion of system indexes here; that's checked for upstream.
+	 *	If we did check it here, deletion of TOAST tables would fail...
 	 * ----------------
 	 */
-	if (IsTransactionBlock() && !userIndexRelation->rd_myxactonly)
-		elog(NOTICE, "Caution: DROP INDEX cannot be rolled back, so don't abort now");
 
 	/* ----------------
 	 * fix DESCRIPTION relation
@@ -1077,20 +1079,14 @@ index_drop(Oid indexId)
 	heap_freetuple(tuple);
 
 	/*
-	 * Find the pg_class tuple for the owning relation.  We do not attempt
-	 * to clear relhasindex, since we are too lazy to test whether any other
-	 * indexes remain (the next VACUUM will fix it if necessary).  But we
-	 * must send out a shared-cache-inval notice on the owning relation
-	 * to ensure other backends update their relcache lists of indexes.
+	 * Update the pg_class tuple for the owning relation.  We are presently
+	 * too lazy to attempt to compute the new correct value of relhasindex
+	 * (the next VACUUM will fix it if necessary).  But we must send out a
+	 * shared-cache-inval notice on the owning relation to ensure other
+	 * backends update their relcache lists of indexes.  So, unconditionally
+	 * do setRelhasindex(true).
 	 */
-	tuple = SearchSysCacheTupleCopy(RELOID,
-									ObjectIdGetDatum(heapId),
-									0, 0, 0);
-
-	Assert(HeapTupleIsValid(tuple));
-
-	ImmediateInvalidateSharedHeapTuple(relationRelation, tuple);
-	heap_freetuple(tuple);
+	setRelhasindex(heapId, true);
 
 	heap_close(relationRelation, RowExclusiveLock);
 
@@ -1131,10 +1127,11 @@ index_drop(Oid indexId)
 	/*
 	 * flush buffer cache and physically remove the file
 	 */
-	ReleaseRelationBuffers(userIndexRelation);
+	i = FlushRelationBuffers(userIndexRelation, (BlockNumber) 0);
+	if (i < 0)
+		elog(ERROR, "index_drop: FlushRelationBuffers returned %d", i);
 
-	if (smgrunlink(DEFAULT_SMGR, userIndexRelation) != SM_SUCCESS)
-		elog(ERROR, "index_drop: unlink: %m");
+	smgrunlink(DEFAULT_SMGR, userIndexRelation);
 
 	/*
 	 * Close rels, but keep locks
@@ -1144,7 +1141,7 @@ index_drop(Oid indexId)
 
 	RelationForgetRelation(indexId);
 
-	/* does something only if it is a temp index */
+	/* if it's a temp index, clear the temp mapping table entry */
 	remove_temp_rel_by_relid(indexId);
 }
 
@@ -1331,7 +1328,11 @@ LockClassinfoForUpdate(Oid relid, HeapTuple rtup,
 		return false;
 	rtup->t_self = classTuple->t_self;
 	pgcform = (Form_pg_class) GETSTRUCT(classTuple);
-	relationRelation = heap_openr(RelationRelationName, RowShareLock);
+	/*
+	 * NOTE: get and hold RowExclusiveLock on pg_class, because caller will
+	 * probably modify the rel's pg_class tuple later on.
+	 */
+	relationRelation = heap_openr(RelationRelationName, RowExclusiveLock);
 	test = heap_mark4update(relationRelation, rtup, buffer);
 	switch (test)
 	{
@@ -1388,57 +1389,38 @@ IndexesAreActive(Oid relid, bool confirmCommitted)
 	if (!heap_getnext(scan, 0))
 		isactive = true;
 	heap_endscan(scan);
-	heap_close(indexRelation, NoLock);
+	heap_close(indexRelation, AccessShareLock);
 	return isactive;
 }
 
 /* ----------------
- *		set relhasindex of pg_class in place
+ *		set relhasindex of relation's pg_class entry
+ *
+ * NOTE: an important side-effect of this operation is that an SI invalidation
+ * message is sent out to all backends --- including me --- causing relcache
+ * entries to be flushed or updated with the new hasindex data.
+ * Therefore, we execute the update even if relhasindex has the right value
+ * already.  Possible future improvement: skip the disk update and just send
+ * an SI message in that case.
  * ----------------
  */
 void
-setRelhasindexInplace(Oid relid, bool hasindex, bool immediate)
+setRelhasindex(Oid relid, bool hasindex)
 {
-	Relation	whichRel;
 	Relation	pg_class;
 	HeapTuple	tuple;
-	Form_pg_class rd_rel;
 	HeapScanDesc pg_class_scan = NULL;
 
-	/* ----------------
-	 * This routine handles updates for only the heap relation
-	 * hasindex. In order to guarantee that we're able to *see* the index
-	 * relation tuple, we bump the command counter id here.
-	 * ----------------
-	 */
-	CommandCounterIncrement();
-
-	/* ----------------
-	 * CommandCounterIncrement() flushes invalid cache entries, including
-	 * those for the heap and index relations for which we're updating
-	 * statistics.	Now that the cache is flushed, it's safe to open the
-	 * relation again.	We need the relation open in order to figure out
-	 * how many blocks it contains.
-	 * ----------------
-	 */
-
-	whichRel = heap_open(relid, ShareLock);
-
-	if (!RelationIsValid(whichRel))
-		elog(ERROR, "setRelhasindexInplace: cannot open relation id %u", relid);
-
-	/* ----------------
-	 * Find the RELATION relation tuple for the given relation.
-	 * ----------------
+	/*
+	 * Find the tuple to update in pg_class.
 	 */
 	pg_class = heap_openr(RelationRelationName, RowExclusiveLock);
-	if (!RelationIsValid(pg_class))
-		elog(ERROR, "setRelhasindexInplace: could not open RELATION relation");
 
 	if (!IsIgnoringSystemIndexes())
 	{
 		tuple = SearchSysCacheTupleCopy(RELOID,
-										ObjectIdGetDatum(relid), 0, 0, 0);
+										ObjectIdGetDatum(relid),
+										0, 0, 0);
 	}
 	else
 	{
@@ -1458,72 +1440,46 @@ setRelhasindexInplace(Oid relid, bool hasindex, bool immediate)
 		if (pg_class_scan)
 			heap_endscan(pg_class_scan);
 		heap_close(pg_class, RowExclusiveLock);
-		elog(ERROR, "setRelhasindexInplace: cannot scan RELATION relation");
-	}
-
-	/*
-	 * Confirm that target tuple is locked by this transaction in case of
-	 * immediate updation.
-	 */
-	if (immediate)
-	{
-		HeapTupleHeader th = tuple->t_data;
-
-		if (!(th->t_infomask & HEAP_XMIN_COMMITTED))
-			elog(ERROR, "Immediate hasindex updation can be done only for committed tuples %x", th->t_infomask);
-		if (th->t_infomask & HEAP_XMAX_INVALID)
-			elog(ERROR, "Immediate hasindex updation can be done only for locked tuples %x", th->t_infomask);
-		if (th->t_infomask & HEAP_XMAX_COMMITTED)
-			elog(ERROR, "Immediate hasindex updation can be done only for locked tuples %x", th->t_infomask);
-		if (!(th->t_infomask & HEAP_MARKED_FOR_UPDATE))
-			elog(ERROR, "Immediate hasindex updation can be done only for locked tuples %x", th->t_infomask);
-		if (!(TransactionIdIsCurrentTransactionId(th->t_xmax)))
-			elog(ERROR, "The updating tuple is already locked by another backend");
+		elog(ERROR, "setRelhasindex: cannot find relation %u in pg_class",
+			 relid);
 	}
 
-	/*
-	 * We shouldn't have to do this, but we do...  Modify the reldesc in
-	 * place with the new values so that the cache contains the latest
-	 * copy.
-	 */
-	whichRel->rd_rel->relhasindex = hasindex;
-
 	/* ----------------
 	 *	Update hasindex in pg_class.
 	 * ----------------
 	 */
+	((Form_pg_class) GETSTRUCT(tuple))->relhasindex = hasindex;
+
 	if (pg_class_scan)
 	{
-		rd_rel = (Form_pg_class) GETSTRUCT(tuple);
-		rd_rel->relhasindex = hasindex;
+		/* Write the modified tuple in-place */
 		WriteNoReleaseBuffer(pg_class_scan->rs_cbuf);
+		/* Send out shared cache inval if necessary */
+		if (!IsBootstrapProcessingMode())
+			RelationInvalidateHeapTuple(pg_class, tuple);
 	}
 	else
 	{
-		HeapTupleData htup;
-		Buffer		buffer;
-
-		htup.t_self = tuple->t_self;
-		heap_fetch(pg_class, SnapshotNow, &htup, &buffer);
-		rd_rel = (Form_pg_class) GETSTRUCT(&htup);
-		rd_rel->relhasindex = hasindex;
-		WriteBuffer(buffer);
-	}
+		heap_update(pg_class, &tuple->t_self, tuple, NULL);
 
-	/*
-	 * Send out a shared-cache-inval message so other backends notice the
-	 * update and fix their syscaches/relcaches.
-	 */
-	if (!IsBootstrapProcessingMode())
-		ImmediateInvalidateSharedHeapTuple(pg_class, tuple);
+		/* Keep the catalog indices up to date */
+		if (!IsIgnoringSystemIndexes())
+		{
+			Relation	idescs[Num_pg_class_indices];
+
+			CatalogOpenIndices(Num_pg_class_indices, Name_pg_class_indices,
+							   idescs);
+			CatalogIndexInsert(idescs, Num_pg_class_indices, pg_class, tuple);
+			CatalogCloseIndices(Num_pg_class_indices, idescs);
+		}
+	}
 
 	if (!pg_class_scan)
 		heap_freetuple(tuple);
 	else
 		heap_endscan(pg_class_scan);
 
-	heap_close(pg_class, NoLock);
-	heap_close(whichRel, NoLock);
+	heap_close(pg_class, RowExclusiveLock);
 }
 
 /* ----------------
@@ -1531,7 +1487,7 @@ setRelhasindexInplace(Oid relid, bool hasindex, bool immediate)
  * ----------------
  */
 void
-UpdateStats(Oid relid, long reltuples, bool inplace)
+UpdateStats(Oid relid, long reltuples)
 {
 	Relation	whichRel;
 	Relation	pg_class;
@@ -1573,6 +1529,7 @@ UpdateStats(Oid relid, long reltuples, bool inplace)
 	if (!RelationIsValid(whichRel))
 		elog(ERROR, "UpdateStats: cannot open relation id %u", relid);
 
+	/* Grab lock to be held till end of xact (probably redundant...) */
 	LockRelation(whichRel, ShareLock);
 
 	/* ----------------
@@ -1580,10 +1537,9 @@ UpdateStats(Oid relid, long reltuples, bool inplace)
 	 * ----------------
 	 */
 	pg_class = heap_openr(RelationRelationName, RowExclusiveLock);
-	if (!RelationIsValid(pg_class))
-		elog(ERROR, "UpdateStats: could not open RELATION relation");
 
-	in_place_upd = (inplace || IsBootstrapProcessingMode());
+	in_place_upd = (IsReindexProcessing() || IsBootstrapProcessingMode());
+
 	if (!in_place_upd)
 	{
 		tuple = SearchSysCacheTupleCopy(RELOID,
@@ -1608,7 +1564,8 @@ UpdateStats(Oid relid, long reltuples, bool inplace)
 		if (pg_class_scan)
 			heap_endscan(pg_class_scan);
 		heap_close(pg_class, RowExclusiveLock);
-		elog(ERROR, "UpdateStats: cannot scan RELATION relation");
+		elog(ERROR, "UpdateStats: cannot find relation %u in pg_class",
+			 relid);
 	}
 
 	/* ----------------
@@ -1655,17 +1612,16 @@ UpdateStats(Oid relid, long reltuples, bool inplace)
 	 */
 	if (in_place_upd)
 	{
-
 		/*
 		 * At bootstrap time, we don't need to worry about concurrency or
-		 * visibility of changes, so we cheat.
+		 * visibility of changes, so we cheat.  Also cheat if REINDEX.
 		 */
-		if (!IsBootstrapProcessingMode())
-			ImmediateInvalidateSharedHeapTuple(pg_class, tuple);
 		rd_rel = (Form_pg_class) GETSTRUCT(tuple);
 		rd_rel->relpages = relpages;
 		rd_rel->reltuples = reltuples;
 		WriteNoReleaseBuffer(pg_class_scan->rs_cbuf);
+		if (!IsBootstrapProcessingMode())
+			RelationInvalidateHeapTuple(pg_class, tuple);
 	}
 	else
 	{
@@ -1700,7 +1656,7 @@ UpdateStats(Oid relid, long reltuples, bool inplace)
 
 	heap_close(pg_class, RowExclusiveLock);
 	/* Cheating a little bit since we didn't open it with heap_open... */
-	heap_close(whichRel, ShareLock);
+	heap_close(whichRel, NoLock);
 }
 
 
@@ -1868,18 +1824,16 @@ DefaultBuild(Relation heapRelation,
 	{
 		Oid			hrelid = RelationGetRelid(heapRelation);
 		Oid			irelid = RelationGetRelid(indexRelation);
-		bool		inplace = IsReindexProcessing();
 
 		heap_close(heapRelation, NoLock);
 		index_close(indexRelation);
-		UpdateStats(hrelid, reltuples, inplace);
-		UpdateStats(irelid, indtuples, inplace);
+		UpdateStats(hrelid, reltuples);
+		UpdateStats(irelid, indtuples);
 		if (oldPred != NULL)
 		{
 			if (indtuples == reltuples)
 				predicate = NULL;
-			if (!inplace)
-				UpdateIndexPredicate(irelid, oldPred, predicate);
+			UpdateIndexPredicate(irelid, oldPred, predicate);
 		}
 	}
 }
@@ -1981,6 +1935,15 @@ reindex_index(Oid indexId, bool force)
 				accessMethodId;
 	bool		old;
 
+	/* ----------------
+	 *	REINDEX within a transaction block is dangerous, because
+	 *	if the transaction is later rolled back we have no way to
+	 *	undo truncation of the index's physical file.  Disallow it.
+	 * ----------------
+	 */
+	if (IsTransactionBlock())
+		elog(ERROR, "REINDEX cannot run inside a BEGIN/END block");
+
 	old = SetReindexProcessing(true);
 
 	/* Scan pg_index to find the index's pg_index entry */
@@ -2024,7 +1987,7 @@ reindex_index(Oid indexId, bool force)
 	 * Release any buffers associated with this index.	If they're dirty,
 	 * they're just dropped without bothering to flush to disk.
 	 */
-	ReleaseRelationBuffers(iRel);
+	DropRelationBuffers(iRel);
 
 	/* Now truncate the actual data and set blocks to zero */
 	smgrtruncate(DEFAULT_SMGR, iRel, 0);
@@ -2056,7 +2019,7 @@ activate_indexes_of_a_table(Oid relid, bool activate)
 	if (IndexesAreActive(relid, true))
 	{
 		if (!activate)
-			setRelhasindexInplace(relid, false, true);
+			setRelhasindex(relid, false);
 		else
 			return false;
 	}
@@ -2117,7 +2080,7 @@ reindex_relation(Oid relid, bool force)
 	heap_endscan(scan);
 	heap_close(indexRelation, AccessShareLock);
 	if (reindexed)
-		setRelhasindexInplace(relid, true, false);
+		setRelhasindex(relid, true);
 	SetReindexProcessing(old);
 	return reindexed;
 }
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 5c176254d6..c02bafc322 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.58 2000/07/14 22:17:42 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.59 2000/11/08 22:09:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -34,20 +34,14 @@
 #include "utils/builtins.h"
 #include "utils/syscache.h"
 
-static Relation copy_heap(Oid OIDOldHeap);
-static void copy_index(Oid OIDOldIndex, Oid OIDNewHeap);
+static Oid copy_heap(Oid OIDOldHeap, char *NewName);
+static void copy_index(Oid OIDOldIndex, Oid OIDNewHeap, char *NewIndexName);
 static void rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
 
 /*
  * cluster
  *
- *	 Check that the relation is a relation in the appropriate user
- *	 ACL. I will use the same security that limits users on the
- *	 renamerel() function.
- *
- *	 Check that the index specified is appropriate for the task
- *	 ( ie it's an index over this relation ). This is trickier.
- *
+ * STILL TO DO:
  *	 Create a list of all the other indicies on this relation. Because
  *	 the cluster will wreck all the tids, I'll need to destroy bogus
  *	 indicies. The user will have to re-create them. Not nice, but
@@ -55,14 +49,6 @@ static void rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
  *	 destroy re-build. This may be possible. I'll check out what the
  *	 index create functiond want in the way of paramaters. On the other
  *	 hand, re-creating n indicies may blow out the space.
- *
- *	 Create new (temporary) relations for the base heap and the new
- *	 index.
- *
- *	 Exclusively lock the relations.
- *
- *	 Create new clustered index and base heap relation.
- *
  */
 void
 cluster(char *oldrelname, char *oldindexname)
@@ -70,101 +56,93 @@ cluster(char *oldrelname, char *oldindexname)
 	Oid			OIDOldHeap,
 				OIDOldIndex,
 				OIDNewHeap;
-
 	Relation	OldHeap,
 				OldIndex;
-	Relation	NewHeap;
-
-	char		NewIndexName[NAMEDATALEN];
+	HeapTuple	tuple;
 	char		NewHeapName[NAMEDATALEN];
+	char		NewIndexName[NAMEDATALEN];
 	char		saveoldrelname[NAMEDATALEN];
 	char		saveoldindexname[NAMEDATALEN];
 
 	/*
-	 * Copy the arguments into local storage, because they are probably
-	 * in palloc'd storage that will go away when we commit a transaction.
+	 * Copy the arguments into local storage, just to be safe.
 	 */
-	strcpy(saveoldrelname, oldrelname);
-	strcpy(saveoldindexname, oldindexname);
+	StrNCpy(saveoldrelname, oldrelname, NAMEDATALEN);
+	StrNCpy(saveoldindexname, oldindexname, NAMEDATALEN);
 
 	/*
-	 * Like vacuum, cluster spans transactions, so I'm going to handle it
-	 * in the same way: commit and restart transactions where needed.
-	 *
 	 * We grab exclusive access to the target rel and index for the duration
-	 * of the initial transaction.
+	 * of the transaction.
 	 */
-
 	OldHeap = heap_openr(saveoldrelname, AccessExclusiveLock);
 	OIDOldHeap = RelationGetRelid(OldHeap);
 
-	OldIndex = index_openr(saveoldindexname); /* Open old index relation	*/
+	OldIndex = index_openr(saveoldindexname);
 	LockRelation(OldIndex, AccessExclusiveLock);
 	OIDOldIndex = RelationGetRelid(OldIndex);
 
 	/*
-	 * XXX Should check that index is in fact an index on this relation?
+	 * Check that index is in fact an index on the given relation
 	 */
-
-	heap_close(OldHeap, NoLock);/* do NOT give up the locks */
+	tuple = SearchSysCacheTuple(INDEXRELID,
+								ObjectIdGetDatum(OIDOldIndex),
+								0, 0, 0);
+	if (!HeapTupleIsValid(tuple))
+		elog(ERROR, "CLUSTER: no pg_index entry for index %u",
+			 OIDOldIndex);
+	if (((Form_pg_index) GETSTRUCT(tuple))->indrelid != OIDOldHeap)
+		elog(ERROR, "CLUSTER: \"%s\" is not an index for table \"%s\"",
+			 saveoldindexname, saveoldrelname);
+
+	/* Drop relcache refcnts, but do NOT give up the locks */
+	heap_close(OldHeap, NoLock);
 	index_close(OldIndex);
 
 	/*
-	 * I need to build the copies of the heap and the index. The Commit()
-	 * between here is *very* bogus. If someone is appending stuff, they
-	 * will get the lock after being blocked and add rows which won't be
-	 * present in the new table. Bleagh! I'd be best to try and ensure
-	 * that no-one's in the tables for the entire duration of this process
-	 * with a pg_vlock.  XXX Isn't the above comment now invalid?
+	 * Create the new heap with a temporary name.
 	 */
-	NewHeap = copy_heap(OIDOldHeap);
-	OIDNewHeap = RelationGetRelid(NewHeap);
-	strcpy(NewHeapName, RelationGetRelationName(NewHeap));
+	snprintf(NewHeapName, NAMEDATALEN, "temp_%u", OIDOldHeap);
+
+	OIDNewHeap = copy_heap(OIDOldHeap, NewHeapName);
 
 	/* To make the new heap visible (which is until now empty). */
 	CommandCounterIncrement();
 
+	/*
+	 * Copy the heap data into the new table in the desired order.
+	 */
 	rebuildheap(OIDNewHeap, OIDOldHeap, OIDOldIndex);
 
-	/* To flush the filled new heap (and the statistics about it). */
+	/* To make the new heap's data visible. */
 	CommandCounterIncrement();
 
 	/* Create new index over the tuples of the new heap. */
-	copy_index(OIDOldIndex, OIDNewHeap);
-	snprintf(NewIndexName, NAMEDATALEN, "temp_%x", OIDOldIndex);
+	snprintf(NewIndexName, NAMEDATALEN, "temp_%u", OIDOldIndex);
 
-	/*
-	 * make this really happen. Flush all the buffers. (Believe me, it is
-	 * necessary ... ended up in a mess without it.)
-	 */
-	CommitTransactionCommand();
-	StartTransactionCommand();
+	copy_index(OIDOldIndex, OIDNewHeap, NewIndexName);
+
+	CommandCounterIncrement();
 
 	/* Destroy old heap (along with its index) and rename new. */
 	heap_drop_with_catalog(saveoldrelname, allowSystemTableMods);
 
-	CommitTransactionCommand();
-	StartTransactionCommand();
+	CommandCounterIncrement();
 
 	renamerel(NewHeapName, saveoldrelname);
+
+	/* This one might be unnecessary, but let's be safe. */
+	CommandCounterIncrement();
+
 	renamerel(NewIndexName, saveoldindexname);
 }
 
-static Relation
-copy_heap(Oid OIDOldHeap)
+static Oid
+copy_heap(Oid OIDOldHeap, char *NewName)
 {
-	char		NewName[NAMEDATALEN];
 	TupleDesc	OldHeapDesc,
 				tupdesc;
 	Oid			OIDNewHeap;
-	Relation	NewHeap,
-				OldHeap;
-
-	/*
-	 * Create a new heap relation with a temporary name, which has the
-	 * same tuple description as the old one.
-	 */
-	snprintf(NewName, NAMEDATALEN, "temp_%x", OIDOldHeap);
+	Relation	OldHeap;
 
 	OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
 	OldHeapDesc = RelationGetDescr(OldHeap);
@@ -173,7 +151,6 @@ copy_heap(Oid OIDOldHeap)
 	 * Need to make a copy of the tuple descriptor,
 	 * heap_create_with_catalog modifies it.
 	 */
-
 	tupdesc = CreateTupleDescCopy(OldHeapDesc);
 
 	OIDNewHeap = heap_create_with_catalog(NewName, tupdesc,
@@ -181,19 +158,15 @@ copy_heap(Oid OIDOldHeap)
 										  allowSystemTableMods);
 
 	if (!OidIsValid(OIDNewHeap))
-		elog(ERROR, "clusterheap: cannot create temporary heap relation\n");
+		elog(ERROR, "copy_heap: cannot create temporary heap relation");
 
-	/* XXX why are we bothering to do this: */
-	NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
-
-	heap_close(NewHeap, AccessExclusiveLock);
-	heap_close(OldHeap, AccessExclusiveLock);
+	heap_close(OldHeap, NoLock);
 
-	return NewHeap;
+	return OIDNewHeap;
 }
 
 static void
-copy_index(Oid OIDOldIndex, Oid OIDNewHeap)
+copy_index(Oid OIDOldIndex, Oid OIDNewHeap, char *NewIndexName)
 {
 	Relation	OldIndex,
 				NewHeap;
@@ -202,18 +175,17 @@ copy_index(Oid OIDOldIndex, Oid OIDNewHeap)
 	Form_pg_index Old_pg_index_Form;
 	Form_pg_class Old_pg_index_relation_Form;
 	IndexInfo  *indexInfo;
-	char	   *NewIndexName;
 
 	NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
 	OldIndex = index_open(OIDOldIndex);
 
 	/*
-	 * OK. Create a new (temporary) index for the one that's already here.
+	 * Create a new (temporary) index like the one that's already here.
 	 * To do this I get the info from pg_index, and add a new index with
 	 * a temporary name.
 	 */
 	Old_pg_index_Tuple = SearchSysCacheTupleCopy(INDEXRELID,
-							ObjectIdGetDatum(RelationGetRelid(OldIndex)),
+												 ObjectIdGetDatum(OIDOldIndex),
 												 0, 0, 0);
 	Assert(Old_pg_index_Tuple);
 	Old_pg_index_Form = (Form_pg_index) GETSTRUCT(Old_pg_index_Tuple);
@@ -221,15 +193,11 @@ copy_index(Oid OIDOldIndex, Oid OIDNewHeap)
 	indexInfo = BuildIndexInfo(Old_pg_index_Tuple);
 
 	Old_pg_index_relation_Tuple = SearchSysCacheTupleCopy(RELOID,
-							ObjectIdGetDatum(RelationGetRelid(OldIndex)),
+														  ObjectIdGetDatum(OIDOldIndex),
 														  0, 0, 0);
 	Assert(Old_pg_index_relation_Tuple);
 	Old_pg_index_relation_Form = (Form_pg_class) GETSTRUCT(Old_pg_index_relation_Tuple);
 
-	/* Set the name. */
-	NewIndexName = palloc(NAMEDATALEN); /* XXX */
-	snprintf(NewIndexName, NAMEDATALEN, "temp_%x", OIDOldIndex);
-
 	index_create(RelationGetRelationName(NewHeap),
 				 NewIndexName,
 				 indexInfo,
@@ -239,10 +207,10 @@ copy_index(Oid OIDOldIndex, Oid OIDNewHeap)
 				 Old_pg_index_Form->indisprimary,
 				 allowSystemTableMods);
 
-	setRelhasindexInplace(OIDNewHeap, true, false);
+	setRelhasindex(OIDNewHeap, true);
 
 	index_close(OldIndex);
-	heap_close(NewHeap, AccessExclusiveLock);
+	heap_close(NewHeap, NoLock);
 }
 
 
@@ -294,6 +262,6 @@ rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
 	index_endscan(ScanDesc);
 
 	index_close(LocalOldIndex);
-	heap_close(LocalOldHeap, AccessExclusiveLock);
-	heap_close(LocalNewHeap, AccessExclusiveLock);
+	heap_close(LocalOldHeap, NoLock);
+	heap_close(LocalNewHeap, NoLock);
 }
diff --git a/src/backend/commands/command.c b/src/backend/commands/command.c
index 4446c9f5cb..54b913dcac 100644
--- a/src/backend/commands/command.c
+++ b/src/backend/commands/command.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/command.c,v 1.108 2000/10/26 21:34:44 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/command.c,v 1.109 2000/11/08 22:09:57 tgl Exp $
  *
  * NOTES
  *	  The PerformAddAttribute() code, like most of the relation
@@ -1661,9 +1661,13 @@ AlterTableCreateToastTable(const char *relationName, bool silent)
 
 	/*
 	 * Update toast rel's pg_class entry to show that it has an index.
-	 * NOTE this also does CommandCounterIncrement() to make index visible.
 	 */
-	setRelhasindexInplace(toast_relid, true, false);
+	setRelhasindex(toast_relid, true);
+
+	/*
+	 * Make index visible
+	 */
+	CommandCounterIncrement();
 
 	/*
 	 * Get the OID of the newly created index
diff --git a/src/backend/commands/comment.c b/src/backend/commands/comment.c
index 6dd3c4dfab..bff2b897c6 100644
--- a/src/backend/commands/comment.c
+++ b/src/backend/commands/comment.c
@@ -356,10 +356,8 @@ CommentAttribute(char *relname, char *attrname, char *comment)
 	attrtuple = SearchSysCacheTuple(ATTNAME, ObjectIdGetDatum(relation->rd_id),
 									PointerGetDatum(attrname), 0, 0);
 	if (!HeapTupleIsValid(attrtuple))
-	{
 		elog(ERROR, "'%s' is not an attribute of class '%s'",
 			 attrname, relname);
-	}
 	oid = attrtuple->t_data->t_oid;
 
 	/*** Call CreateComments() to create/drop the comments ***/
@@ -368,8 +366,7 @@ CommentAttribute(char *relname, char *attrname, char *comment)
 
 	/*** Now, close the heap relation and return ***/
 
-	heap_close(relation, AccessShareLock);
-
+	heap_close(relation, NoLock);
 }
 
 /*------------------------------------------------------------------
@@ -840,6 +837,5 @@ CommentTrigger(char *trigger, char *relname, char *comment)
 
 	heap_endscan(scan);
 	heap_close(pg_trigger, AccessShareLock);
-	heap_close(relation, AccessShareLock);
-
+	heap_close(relation, NoLock);
 }
diff --git a/src/backend/commands/creatinh.c b/src/backend/commands/creatinh.c
index b6485850eb..75fd047392 100644
--- a/src/backend/commands/creatinh.c
+++ b/src/backend/commands/creatinh.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/creatinh.c,v 1.64 2000/09/12 21:06:47 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/creatinh.c,v 1.65 2000/11/08 22:09:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -149,6 +149,20 @@ DefineRelation(CreateStmt *stmt, char relkind)
 
 	StoreCatalogInheritance(relationId, inheritList);
 
+	/*
+	 * We must bump the command counter to make the newly-created relation
+	 * tuple visible for opening.
+	 */
+	CommandCounterIncrement();
+
+	/*
+	 * Open the new relation and acquire exclusive lock on it.  This isn't
+	 * really necessary for locking out other backends (since they can't
+	 * see the new rel anyway until we commit), but it keeps the lock manager
+	 * from complaining about deadlock risks.
+	 */
+	rel = heap_openr(relname, AccessExclusiveLock);
+
 	/*
 	 * Now add any newly specified column default values and CHECK
 	 * constraints to the new relation.  These are passed to us in the
@@ -181,25 +195,11 @@ DefineRelation(CreateStmt *stmt, char relkind)
 		rawDefaults = lappend(rawDefaults, rawEnt);
 	}
 
-	/* If no raw defaults and no constraints, nothing to do. */
-	if (rawDefaults == NIL && stmt->constraints == NIL)
-		return;
-
-	/*
-	 * We must bump the command counter to make the newly-created relation
-	 * tuple visible for opening.
-	 */
-	CommandCounterIncrement();
-
-	/*
-	 * Open the new relation.
-	 */
-	rel = heap_openr(relname, AccessExclusiveLock);
-
 	/*
-	 * Parse and add the defaults/constraints.
+	 * Parse and add the defaults/constraints, if any.
 	 */
-	AddRelationRawConstraints(rel, rawDefaults, stmt->constraints);
+	if (rawDefaults || stmt->constraints)
+		AddRelationRawConstraints(rel, rawDefaults, stmt->constraints);
 
 	/*
 	 * Clean up.  We keep lock on new relation (although it shouldn't be
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index f9d1a92c75..fff6d56975 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/indexcmds.c,v 1.39 2000/10/22 23:32:39 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/indexcmds.c,v 1.40 2000/11/08 22:09:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -214,7 +214,7 @@ DefineIndex(char *heapRelationName,
 	 * backends to flush their relcache entries and in particular their
 	 * cached lists of the indexes for this relation.
 	 */
-	setRelhasindexInplace(relationId, true, false);
+	setRelhasindex(relationId, true);
 }
 
 
@@ -635,6 +635,15 @@ ReindexIndex(const char *name, bool force /* currently unused */ )
 {
 	HeapTuple	tuple;
 
+	/* ----------------
+	 *	REINDEX within a transaction block is dangerous, because
+	 *	if the transaction is later rolled back we have no way to
+	 *	undo truncation of the index's physical file.  Disallow it.
+	 * ----------------
+	 */
+	if (IsTransactionBlock())
+		elog(ERROR, "REINDEX cannot run inside a BEGIN/END block");
+
 	tuple = SearchSysCacheTuple(RELNAME,
 								PointerGetDatum(name),
 								0, 0, 0);
@@ -666,6 +675,15 @@ ReindexTable(const char *name, bool force)
 {
 	HeapTuple	tuple;
 
+	/* ----------------
+	 *	REINDEX within a transaction block is dangerous, because
+	 *	if the transaction is later rolled back we have no way to
+	 *	undo truncation of the index's physical file.  Disallow it.
+	 * ----------------
+	 */
+	if (IsTransactionBlock())
+		elog(ERROR, "REINDEX cannot run inside a BEGIN/END block");
+
 	tuple = SearchSysCacheTuple(RELNAME,
 								PointerGetDatum(name),
 								0, 0, 0);
diff --git a/src/backend/commands/rename.c b/src/backend/commands/rename.c
index 0f41cac1dc..6a9de4abf0 100644
--- a/src/backend/commands/rename.c
+++ b/src/backend/commands/rename.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/rename.c,v 1.51 2000/10/22 23:32:39 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/rename.c,v 1.52 2000/11/08 22:09:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -183,16 +183,9 @@ renamerel(const char *oldrelname, const char *newrelname)
 	Oid			reloid;
 	char		relkind;
 	Relation	irelations[Num_pg_class_indices];
-#ifdef OLD_FILE_NAMING
-	int			i;
-	char		oldpath[MAXPGPATH],
-				newpath[MAXPGPATH],
-				toldpath[MAXPGPATH + 10],
-				tnewpath[MAXPGPATH + 10];
-#endif
 
 	if (!allowSystemTableMods && IsSystemRelationName(oldrelname))
-		elog(ERROR, "renamerel: system relation \"%s\" not renamed",
+		elog(ERROR, "renamerel: system relation \"%s\" may not be renamed",
 			 oldrelname);
 
 	if (!allowSystemTableMods && IsSystemRelationName(newrelname))
@@ -201,7 +194,7 @@ renamerel(const char *oldrelname, const char *newrelname)
 
 	/*
 	 * Check for renaming a temp table, which only requires altering
-	 * the temp-table mapping, not the physical table.
+	 * the temp-table mapping, not the underlying table.
 	 */
 	if (rename_temp_relation(oldrelname, newrelname))
 		return;					/* all done... */
@@ -213,7 +206,7 @@ renamerel(const char *oldrelname, const char *newrelname)
 	targetrelation = RelationNameGetRelation(oldrelname);
 
 	if (!RelationIsValid(targetrelation))
-		elog(ERROR, "Relation '%s' does not exist", oldrelname);
+		elog(ERROR, "Relation \"%s\" does not exist", oldrelname);
 
 	/*
 	 * Grab an exclusive lock on the target table, which we will NOT
@@ -221,46 +214,9 @@ renamerel(const char *oldrelname, const char *newrelname)
 	 */
 	LockRelation(targetrelation, AccessExclusiveLock);
 
-	/* ----------------
-	 *	RENAME TABLE within a transaction block is dangerous, because
-	 *	if the transaction is later rolled back we have no way to
-	 *	undo the rename of the relation's physical file.  For now, allow it
-	 *	but emit a warning message.
-	 *	Someday we might want to consider postponing the physical rename
-	 *	until transaction commit, but that's a lot of work...
-	 *	The only case that actually works right is for relations created
-	 *	in the current transaction, since the post-abort state would be that
-	 *	they don't exist anyway.  So, no warning in that case.
-	 * ----------------
-	 */
-	if (IsTransactionBlock() && !targetrelation->rd_myxactonly)
-		elog(NOTICE, "Caution: RENAME TABLE cannot be rolled back, so don't abort now");
-
 	reloid = RelationGetRelid(targetrelation);
 	relkind = targetrelation->rd_rel->relkind;
 
-	/*
-	 * Flush all blocks of the relation out of the buffer pool.  We need
-	 * this because the blocks are marked with the relation's name as well
-	 * as OID. If some backend tries to write a dirty buffer with
-	 * mdblindwrt after we've renamed the physical file, we'll be in big
-	 * trouble.
-	 *
-	 * Since we hold the exclusive lock on the relation, we don't have to
-	 * worry about more blocks being read in while we finish the rename.
-	 */
-	if (FlushRelationBuffers(targetrelation, (BlockNumber) 0) < 0)
-		elog(ERROR, "renamerel: unable to flush relation from buffer pool");
-
-	/*
-	 * Make sure smgr and lower levels close the relation's files. (Next
-	 * access to rel will reopen them.)
-	 *
-	 * Note: we rely on shared cache invalidation message to make other
-	 * backends close and re-open the files.
-	 */
-	smgrclose(DEFAULT_SMGR, targetrelation);
-
 	/*
 	 * Close rel, but keep exclusive lock!
 	 */
@@ -271,8 +227,9 @@ renamerel(const char *oldrelname, const char *newrelname)
 	 * the right instant).  It'll get rebuilt on next access to relation.
 	 *
 	 * XXX What if relation is myxactonly?
+	 *
+	 * XXX this is probably not necessary anymore?
 	 */
-	targetrelation = NULL;		/* make sure I don't touch it again */
 	RelationIdInvalidateRelationCacheByRelationId(reloid);
 
 	/*
@@ -291,7 +248,8 @@ renamerel(const char *oldrelname, const char *newrelname)
 		elog(ERROR, "renamerel: relation \"%s\" exists", newrelname);
 
 	/*
-	 * Update pg_class tuple with new relname.
+	 * Update pg_class tuple with new relname.  (Scribbling on oldreltup
+	 * is OK because it's a copy...)
 	 */
 	StrNCpy(NameStr(((Form_pg_class) GETSTRUCT(oldreltup))->relname),
 			newrelname, NAMEDATALEN);
@@ -310,36 +268,4 @@ renamerel(const char *oldrelname, const char *newrelname)
 	 */
 	if (relkind != RELKIND_INDEX)
 		TypeRename(oldrelname, newrelname);
-
-#ifdef OLD_FILE_NAMING
-	/*
-	 * Perform physical rename of files.  If this fails, we haven't yet
-	 * done anything irreversible.  NOTE that this MUST be the last step;
-	 * an error occurring afterwards would leave the relation hosed!
-	 *
-	 * XXX smgr.c ought to provide an interface for this; doing it directly
-	 * is bletcherous.
-	 */
-	strcpy(oldpath, relpath(oldrelname));
-	strcpy(newpath, relpath(newrelname));
-	if (rename(oldpath, newpath) < 0)
-		elog(ERROR, "renamerel: unable to rename %s to %s: %m",
-			 oldpath, newpath);
-
-	/* rename additional segments of relation, too */
-	for (i = 1;; i++)
-	{
-		sprintf(toldpath, "%s.%d", oldpath, i);
-		sprintf(tnewpath, "%s.%d", newpath, i);
-		if (rename(toldpath, tnewpath) < 0)
-		{
-			/* expected case is that there's not another segment file */
-			if (errno == ENOENT)
-				break;
-			/* otherwise we're up the creek... */
-			elog(ERROR, "renamerel: unable to rename %s to %s: %m",
-				 toldpath, tnewpath);
-		}
-	}
-#endif
 }
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 059bc42987..33340291e1 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/trigger.c,v 1.78 2000/10/16 17:08:05 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/trigger.c,v 1.79 2000/11/08 22:09:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -388,6 +388,7 @@ RelationRemoveTriggers(Relation rel)
 	HeapScanDesc tgscan;
 	ScanKeyData key;
 	HeapTuple	tup;
+	bool		found = false;
 
 	tgrel = heap_openr(TriggerRelationName, RowExclusiveLock);
 	ScanKeyEntryInitialize(&key, 0, Anum_pg_trigger_tgrelid,
@@ -403,17 +404,44 @@ RelationRemoveTriggers(Relation rel)
 		DeleteComments(tup->t_data->t_oid);
 
 		heap_delete(tgrel, &tup->t_self, NULL);
+
+		found = true;
 	}
 
 	heap_endscan(tgscan);
 
 	/* ----------
-	 * Need to bump it here so the following doesn't see
-	 * the already deleted triggers again for a self-referencing
-	 * table.
+	 * If we deleted any triggers, must update pg_class entry and
+	 * advance command counter to make the updated entry visible.
+	 * This is fairly annoying, since we're just going to drop the
+	 * durn thing later, but it's necessary to have a consistent
+	 * state in case we do CommandCounterIncrement() below ---
+	 * if RelationBuildTriggers() runs, it will complain otherwise.
+	 * Perhaps RelationBuildTriggers() shouldn't be so picky...
 	 * ----------
 	 */
-	CommandCounterIncrement();
+	if (found)
+	{
+		Relation	pgrel;
+		Relation	ridescs[Num_pg_class_indices];
+
+		pgrel = heap_openr(RelationRelationName, RowExclusiveLock);
+		tup = SearchSysCacheTupleCopy(RELOID,
+									  RelationGetRelid(rel),
+									  0, 0, 0);
+		if (!HeapTupleIsValid(tup))
+			elog(ERROR, "RelationRemoveTriggers: relation %u not found in pg_class",
+				 RelationGetRelid(rel));
+
+		((Form_pg_class) GETSTRUCT(tup))->reltriggers = 0;
+		heap_update(pgrel, &tup->t_self, tup, NULL);
+		CatalogOpenIndices(Num_pg_class_indices, Name_pg_class_indices, ridescs);
+		CatalogIndexInsert(ridescs, Num_pg_class_indices, pgrel, tup);
+		CatalogCloseIndices(Num_pg_class_indices, ridescs);
+		heap_freetuple(tup);
+		heap_close(pgrel, RowExclusiveLock);
+		CommandCounterIncrement();
+	}
 
 	/* ----------
 	 * Also drop all constraint triggers referencing this relation
@@ -431,22 +459,21 @@ RelationRemoveTriggers(Relation rel)
 
 		pg_trigger = (Form_pg_trigger) GETSTRUCT(tup);
 
-		refrel = heap_open(pg_trigger->tgrelid, NoLock);
+		stmt.trigname = pstrdup(NameStr(pg_trigger->tgname));
+
+		/* May as well grab AccessExclusiveLock, since DropTrigger will. */
+		refrel = heap_open(pg_trigger->tgrelid, AccessExclusiveLock);
 		stmt.relname = pstrdup(RelationGetRelationName(refrel));
 		heap_close(refrel, NoLock);
 
-		stmt.trigname = DatumGetCString(DirectFunctionCall1(nameout,
-						NameGetDatum(&pg_trigger->tgname)));
-
-
 		elog(NOTICE, "DROP TABLE implicitly drops referential integrity trigger from table \"%s\"", stmt.relname);
 
 		DropTrigger(&stmt);
 
 		/* ----------
 		 * Need to do a command counter increment here to show up
-		 * new pg_class.reltriggers in the next loop invocation already
-		 * (there are multiple referential integrity action
+		 * new pg_class.reltriggers in the next loop iteration
+		 * (in case there are multiple referential integrity action
 		 * triggers for the same FK table defined on the PK table).
 		 * ----------
 		 */
@@ -747,9 +774,6 @@ equalTriggerDescs(TriggerDesc *trigdesc1, TriggerDesc *trigdesc2)
 	 * We need not examine the "index" data, just the trigger array
 	 * itself; if we have the same triggers with the same types, the
 	 * derived index data should match.
-	 *
-	 * XXX It seems possible that the same triggers could appear in different
-	 * orders in the two trigger arrays; do we need to handle that?
 	 */
 	if (trigdesc1 != NULL)
 	{
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index cb47eda5c6..a396ec7871 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *	$Id: execAmi.c,v 1.54 2000/10/26 21:35:15 tgl Exp $
+ *	$Id: execAmi.c,v 1.55 2000/11/08 22:09:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -61,11 +61,8 @@ static Pointer ExecBeginScan(Relation relation, int nkeys, ScanKey skeys,
  *		  nkeys    -- number of keys
  *		  skeys    -- keys to restrict scanning
  *			 isindex  -- if this is true, the relation is the relid of
- *						 an index relation, else it is an index into the
- *						 range table.
+ *						 an index relation, else it is a heap relation.
  *		Returns the relation as(relDesc scanDesc)
- *		   If this structure is changed, need to modify the access macros
- *		defined in execInt.h.
  * ----------------------------------------------------------------
  */
 void
@@ -90,16 +87,19 @@ ExecOpenScanR(Oid relOid,
 	 */
 
 	/* ----------------
-	 *	open the relation with the correct call depending
+	 *	Open the relation with the correct call depending
 	 *	on whether this is a heap relation or an index relation.
 	 *
-	 *	Do not lock the rel here; beginscan will acquire AccessShareLock.
+	 *	For a table, acquire AccessShareLock for the duration of the query
+	 *	execution.  For indexes, acquire no lock here; the index machinery
+	 *	does its own locks and unlocks.  (We rely on having some kind of
+	 *	lock on the parent table to ensure the index won't go away!)
 	 * ----------------
 	 */
 	if (isindex)
 		relation = index_open(relOid);
 	else
-		relation = heap_open(relOid, NoLock);
+		relation = heap_open(relOid, AccessShareLock);
 
 	scanDesc = ExecBeginScan(relation,
 							 nkeys,
@@ -136,8 +136,6 @@ ExecBeginScan(Relation relation,
 {
 	Pointer		scanDesc;
 
-	scanDesc = NULL;
-
 	/* ----------------
 	 *	open the appropriate type of scan.
 	 *
@@ -183,12 +181,11 @@ ExecCloseR(Plan *node)
 	HeapScanDesc scanDesc;
 
 	/* ----------------
-	 *	shut down the heap scan and close the heap relation
+	 *	get state for node and shut down the heap scan, if any
 	 * ----------------
 	 */
 	switch (nodeTag(node))
 	{
-
 		case T_SeqScan:
 			state = ((SeqScan *) node)->scanstate;
 			break;
@@ -212,18 +209,9 @@ ExecCloseR(Plan *node)
 	if (scanDesc != NULL)
 		heap_endscan(scanDesc);
 
-	/*
-	 * endscan released AccessShareLock acquired by beginscan.	If we are
-	 * holding any stronger locks on the rel, they should be held till end
-	 * of xact.  Therefore, we need only close the rel and not release
-	 * locks.
-	 */
-	if (relation != NULL)
-		heap_close(relation, NoLock);
-
 	/* ----------------
 	 *	if this is an index scan then we have to take care
-	 *	of the index relations as well..
+	 *	of the index relations as well.
 	 * ----------------
 	 */
 	if (IsA(node, IndexScan))
@@ -242,7 +230,7 @@ ExecCloseR(Plan *node)
 		for (i = 0; i < numIndices; i++)
 		{
 			/* ----------------
-			 *	shut down each of the scans and
+			 *	shut down each of the index scans and
 			 *	close each of the index relations
 			 * ----------------
 			 */
@@ -253,6 +241,16 @@ ExecCloseR(Plan *node)
 				index_close(indexRelationDescs[i]);
 		}
 	}
+
+	/*
+	 * Finally, close the heap relation.
+	 *
+	 * Currently, we do not release the AccessShareLock acquired by
+	 * ExecOpenScanR.  This lock should be held till end of transaction.
+	 * (There is a faction that considers this too much locking, however.)
+	 */
+	if (relation != NULL)
+		heap_close(relation, NoLock);
 }
 
 /* ----------------------------------------------------------------
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index 93edc9f878..63f39f2e4c 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *	$Id: analyze.c,v 1.164 2000/11/05 01:42:07 tgl Exp $
+ *	$Id: analyze.c,v 1.165 2000/11/08 22:09:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -115,7 +115,7 @@ static void
 release_pstate_resources(ParseState *pstate)
 {
 	if (pstate->p_target_relation != NULL)
-		heap_close(pstate->p_target_relation, AccessShareLock);
+		heap_close(pstate->p_target_relation, NoLock);
 	pstate->p_target_relation = NULL;
 	pstate->p_target_rangetblentry = NULL;
 }
@@ -292,6 +292,7 @@ transformDeleteStmt(ParseState *pstate, DeleteStmt *stmt)
 	qry->commandType = CMD_DELETE;
 
 	/* set up a range table */
+	lockTargetTable(pstate, stmt->relname);
 	makeRangeTable(pstate, NIL);
 	setTargetTable(pstate, stmt->relname, stmt->inh, true);
 
@@ -331,6 +332,13 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt)
 	qry->commandType = CMD_INSERT;
 	pstate->p_is_insert = true;
 
+	/*
+	 * Must get write lock on target table before scanning SELECT,
+	 * else we will grab the wrong kind of initial lock if the target
+	 * table is also mentioned in the SELECT part.
+	 */
+	lockTargetTable(pstate, stmt->relname);
+
 	/*
 	 * Is it INSERT ... SELECT or INSERT ... VALUES?
 	 */
@@ -1521,6 +1529,16 @@ transformRuleStmt(ParseState *pstate, RuleStmt *stmt)
 	qry->commandType = CMD_UTILITY;
 	qry->utilityStmt = (Node *) stmt;
 
+	/*
+	 * To avoid deadlock, make sure the first thing we do is grab
+	 * AccessExclusiveLock on the target relation.  This will be
+	 * needed by DefineQueryRewrite(), and we don't want to grab a lesser
+	 * lock beforehand.  We don't need to hold a refcount on the relcache
+	 * entry, however.
+	 */
+	heap_close(heap_openr(stmt->object->relname, AccessExclusiveLock),
+			   NoLock);
+
 	/*
 	 * NOTE: 'OLD' must always have a varno equal to 1 and 'NEW'
 	 * equal to 2.  Set up their RTEs in the main pstate for use
@@ -1727,6 +1745,9 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 		qry->isBinary = FALSE;
 	}
 
+	/* make FOR UPDATE clause available to addRangeTableEntry */
+	pstate->p_forUpdate = stmt->forUpdate;
+
 	/* set up a range table */
 	makeRangeTable(pstate, stmt->fromClause);
 
@@ -1765,7 +1786,7 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->rtable = pstate->p_rtable;
 	qry->jointree = makeFromExpr(pstate->p_joinlist, qual);
 
-	if (stmt->forUpdate != NULL)
+	if (stmt->forUpdate != NIL)
 		transformForUpdate(qry, stmt->forUpdate);
 
 	return qry;
@@ -1951,7 +1972,7 @@ transformSetOperationStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->rtable = pstate->p_rtable;
 	qry->jointree = makeFromExpr(pstate->p_joinlist, NULL);
 
-	if (forUpdate != NULL)
+	if (forUpdate != NIL)
 		transformForUpdate(qry, forUpdate);
 
 	return qry;
@@ -2159,6 +2180,7 @@ transformUpdateStmt(ParseState *pstate, UpdateStmt *stmt)
 	 * used in FROM, we'd fail to notice that it should be marked
 	 * checkForRead as well as checkForWrite.  See setTargetTable().
 	 */
+	lockTargetTable(pstate, stmt->relname);
 	makeRangeTable(pstate, stmt->fromClause);
 	setTargetTable(pstate, stmt->relname, stmt->inh, true);
 
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 4fbc628c58..5572828d25 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.207 2000/11/08 21:28:06 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.208 2000/11/08 22:09:58 tgl Exp $
  *
  * HISTORY
  *	  AUTHOR			DATE			MAJOR EVENT
@@ -334,7 +334,7 @@ static void doNegateFloat(Value *v);
  * when some sort of pg_privileges relation is introduced.
  * - Todd A. Brandys 1998-01-01?
  */
-%token	ABORT_TRANS, ACCESS, AFTER, AGGREGATE, ANALYZE, ANALYSE /* British */
+%token	ABORT_TRANS, ACCESS, AFTER, AGGREGATE, ANALYZE, ANALYSE,
 		BACKWARD, BEFORE, BINARY, BIT,
 		CACHE, CHECKPOINT, CLUSTER, COMMENT, COPY, CREATEDB, CREATEUSER, CYCLE,
 		DATABASE, DELIMITERS, DO,
@@ -2466,11 +2466,7 @@ ExtendStmt:  EXTEND INDEX index_name where_clause
 /* NOT USED
 RecipeStmt:  EXECUTE RECIPE recipe_name
 				{
-					RecipeStmt *n;
-					if (!IsTransactionBlock())
-						elog(ERROR,"EXECUTE RECIPE may only be used in begin/end transaction blocks");
-
-					n = makeNode(RecipeStmt);
+					RecipeStmt *n = makeNode(RecipeStmt);
 					n->recipeName = $3;
 					$$ = (Node *)n;
 				}
@@ -2633,8 +2629,6 @@ oper_argtypes:	Typename
 ReindexStmt:  REINDEX reindex_type name opt_force
 				{
 					ReindexStmt *n = makeNode(ReindexStmt);
-					if (IsTransactionBlock())
-						elog(ERROR,"REINDEX command could only be used outside begin/end transaction blocks");
 					n->reindexType = $2;
 					n->name = $3;
 					n->force = $4;
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index 38dc3ea097..60521d1347 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/parser/parse_clause.c,v 1.70 2000/10/07 00:58:17 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/parser/parse_clause.c,v 1.71 2000/11/08 22:09:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -87,6 +87,34 @@ makeRangeTable(ParseState *pstate, List *frmList)
 	}
 }
 
+/*
+ * lockTargetTable
+ *	  Open the target relation of INSERT/UPDATE/DELETE by name and acquire
+ *	  RowExclusiveLock on it.  This must be done before building the range
+ *	  table, in case the target is also mentioned as a source relation ---
+ *	  we want to be sure to grab the write lock before any read lock.
+ *
+ * Sets pstate->p_target_relation; resets p_target_rangetblentry to NULL.
+ */
+void
+lockTargetTable(ParseState *pstate, char *relname)
+{
+	/* Close old target, keeping its lock; only possible for multi-action rules */
+	if (pstate->p_target_relation != NULL)
+		heap_close(pstate->p_target_relation, NoLock);
+	pstate->p_target_relation = NULL;
+	pstate->p_target_rangetblentry = NULL; /* setTargetTable will set this */
+
+	/*
+	 * Open target rel and grab RowExclusiveLock (which we will hold till
+	 * end of transaction).
+	 *
+	 * analyze.c will eventually do the corresponding heap_close(),
+	 * but *not* release the lock.
+	 */
+	pstate->p_target_relation = heap_openr(relname, RowExclusiveLock);
+}
+
 /*
  * setTargetTable
  *	  Add the target relation of INSERT/UPDATE/DELETE to the range table,
@@ -133,13 +161,10 @@ setTargetTable(ParseState *pstate, char *relname, bool inh, bool inJoinSet)
 	if (inJoinSet)
 		addRTEtoJoinList(pstate, rte);
 
-	/* This could only happen for multi-action rules */
-	if (pstate->p_target_relation != NULL)
-		heap_close(pstate->p_target_relation, AccessShareLock);
+	/* lockTargetTable should have been called earlier */
+	Assert(pstate->p_target_relation != NULL);
 
 	pstate->p_target_rangetblentry = rte;
-	pstate->p_target_relation = heap_open(rte->relid, AccessShareLock);
-	/* will close relation later, see analyze.c */
 }
 
 
diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index 3fccd95cb1..984485f9b4 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/parser/parse_relation.c,v 1.49 2000/09/29 18:21:36 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/parser/parse_relation.c,v 1.50 2000/11/08 22:09:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -34,6 +34,7 @@ static Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte,
 							  char *colname);
 static Node *scanJoinForColumn(JoinExpr *join, char *colname,
 							   int sublevels_up);
+static bool isForUpdate(ParseState *pstate, char *relname);
 static List *expandNamesVars(ParseState *pstate, List *names, List *vars);
 static void warnAutoRange(ParseState *pstate, char *refname);
 
@@ -477,6 +478,7 @@ addRangeTableEntry(ParseState *pstate,
 				   bool inFromCl)
 {
 	char	   *refname = alias ? alias->relname : relname;
+	LOCKMODE	lockmode;
 	Relation	rel;
 	RangeTblEntry *rte;
 	Attr	   *eref;
@@ -502,17 +504,22 @@ addRangeTableEntry(ParseState *pstate,
 
 	/*
 	 * Get the rel's OID.  This access also ensures that we have an
-	 * up-to-date relcache entry for the rel.  We don't need to keep it
-	 * open, however. Since this is open anyway, let's check that the
-	 * number of column aliases is reasonable. - Thomas 2000-02-04
+	 * up-to-date relcache entry for the rel.  Since this is typically
+	 * the first access to a rel in a statement, be careful to get the
+	 * right access level depending on whether we're doing SELECT FOR UPDATE.
 	 */
-	rel = heap_openr(relname, AccessShareLock);
+	lockmode = isForUpdate(pstate, relname) ? RowShareLock : AccessShareLock;
+	rel = heap_openr(relname, lockmode);
 	rte->relid = RelationGetRelid(rel);
-	maxattrs = RelationGetNumberOfAttributes(rel);
 
 	eref = alias ? (Attr *) copyObject(alias) : makeAttr(refname, NULL);
 	numaliases = length(eref->attrs);
 
+	/*
+	 * Since the rel is open anyway, let's check that the
+	 * number of column aliases is reasonable. - Thomas 2000-02-04
+	 */
+	maxattrs = RelationGetNumberOfAttributes(rel);
 	if (maxattrs < numaliases)
 		elog(ERROR, "Table \"%s\" has %d columns available but %d columns specified",
 			 refname, maxattrs, numaliases);
@@ -527,7 +534,12 @@ addRangeTableEntry(ParseState *pstate,
 	}
 	rte->eref = eref;
 
-	heap_close(rel, AccessShareLock);
+	/*
+	 * Drop the rel refcount, but keep the access lock till end of transaction
+	 * so that the table can't be deleted or have its schema modified
+	 * underneath us.
+	 */
+	heap_close(rel, NoLock);
 
 	/*----------
 	 * Flags:
@@ -643,6 +655,41 @@ addRangeTableEntryForSubquery(ParseState *pstate,
 	return rte;
 }
 
+/*
+ * Is relname selected FOR UPDATE at this query level or any parent level?
+ */
+static bool
+isForUpdate(ParseState *pstate, char *relname)
+{
+	/* Walk up via parentParseState so enclosing query levels are checked too */
+	while (pstate != NULL)
+	{
+		if (pstate->p_forUpdate != NIL)
+		{
+			if (lfirst(pstate->p_forUpdate) == NULL)
+			{
+				/* NULL first entry: FOR UPDATE applies to all tables in query */
+				return true;
+			}
+			else
+			{
+				/* otherwise the list holds relation names; match by strcmp */
+				List   *l;
+
+				foreach(l, pstate->p_forUpdate)
+				{
+					char	   *rname = lfirst(l);
+
+					if (strcmp(relname, rname) == 0)
+						return true;
+				}
+			}
+		}
+		pstate = pstate->parentParseState;
+	}
+	return false;
+}
+
 /*
  * Add the given RTE as a top-level entry in the pstate's join list,
  * unless there already is an entry for it.
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index d0fe6a5ee1..a0a9d5671e 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/rewrite/rewriteHandler.c,v 1.82 2000/10/05 19:11:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/rewrite/rewriteHandler.c,v 1.83 2000/11/08 22:09:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -363,14 +363,6 @@ static Query *
 fireRIRrules(Query *parsetree)
 {
 	int			rt_index;
-	RangeTblEntry *rte;
-	Relation	rel;
-	List	   *locks;
-	RuleLock   *rules;
-	RewriteRule *rule;
-	bool		relIsUsed;
-	int			i;
-	List	   *l;
 
 	/*
 	 * don't try to convert this into a foreach loop, because rtable list
@@ -379,6 +371,16 @@ fireRIRrules(Query *parsetree)
 	rt_index = 0;
 	while (rt_index < length(parsetree->rtable))
 	{
+		RangeTblEntry *rte;
+		Relation	rel;
+		List	   *locks;
+		RuleLock   *rules;
+		RewriteRule *rule;
+		LOCKMODE	lockmode;
+		bool		relIsUsed;
+		int			i;
+		List	   *l;
+
 		++rt_index;
 
 		rte = rt_fetch(rt_index, parsetree->rtable);
@@ -406,11 +408,32 @@ fireRIRrules(Query *parsetree)
 		if (!relIsUsed && rt_index != parsetree->resultRelation)
 			continue;
 
-		rel = heap_openr(rte->relname, AccessShareLock);
+		/*
+		 * This may well be the first access to the relation during
+		 * the current statement (it will be, if this Query was extracted
+		 * from a rule or somehow got here other than via the parser).
+		 * Therefore, grab the appropriate lock type for the relation,
+		 * and do not release it until end of transaction.  This protects
+		 * the rewriter and planner against schema changes mid-query.
+		 *
+		 * If the relation is the query's result relation, then RewriteQuery()
+		 * already got the right lock on it, so we need no additional lock.
+		 * Otherwise, check to see if the relation is accessed FOR UPDATE
+		 * or not.
+		 */
+		if (rt_index == parsetree->resultRelation)
+			lockmode = NoLock;
+		else if (intMember(rt_index, parsetree->rowMarks))
+			lockmode = RowShareLock;
+		else
+			lockmode = AccessShareLock;
+
+		rel = heap_openr(rte->relname, lockmode);
+
 		rules = rel->rd_rules;
 		if (rules == NULL)
 		{
-			heap_close(rel, AccessShareLock);
+			heap_close(rel, NoLock);
 			continue;
 		}
 
@@ -450,7 +473,7 @@ fireRIRrules(Query *parsetree)
 										  relIsUsed);
 		}
 
-		heap_close(rel, AccessShareLock);
+		heap_close(rel, NoLock);
 	}
 
 	/*
@@ -761,8 +784,19 @@ RewriteQuery(Query *parsetree, bool *instead_flag, List **qual_products)
 	 * the statement is an update, insert or delete - fire rules on it.
 	 */
 	result_relation = parsetree->resultRelation;
+	Assert(result_relation != 0);
 	rt_entry = rt_fetch(result_relation, parsetree->rtable);
-	rt_entry_relation = heap_openr(rt_entry->relname, AccessShareLock);
+
+	/*
+	 * This may well be the first access to the result relation during
+	 * the current statement (it will be, if this Query was extracted
+	 * from a rule or somehow got here other than via the parser).
+	 * Therefore, grab the appropriate lock type for a result relation,
+	 * and do not release it until end of transaction.  This protects the
+	 * rewriter and planner against schema changes mid-query.
+	 */
+	rt_entry_relation = heap_openr(rt_entry->relname, RowExclusiveLock);
+
 	rt_entry_locks = rt_entry_relation->rd_rules;
 
 	if (rt_entry_locks != NULL)
@@ -778,7 +812,7 @@ RewriteQuery(Query *parsetree, bool *instead_flag, List **qual_products)
 									qual_products);
 	}
 
-	heap_close(rt_entry_relation, AccessShareLock);
+	heap_close(rt_entry_relation, NoLock); /* keep lock! */
 
 	return product_queries;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9c9bda5035..8d40e8d952 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.92 2000/10/28 16:20:55 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.93 2000/11/08 22:09:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -709,23 +709,28 @@ refcount = %ld, file: %s, line: %d\n",
 #endif
 
 /*
- * FlushBuffer -- like WriteBuffer, but force the page to disk.
+ * FlushBuffer -- like WriteBuffer, but write the page immediately,
+ * rather than just marking it dirty.  On success return, the buffer will
+ * no longer be dirty.
  *
  * 'buffer' is known to be dirty/pinned, so there should not be a
  * problem reading the BufferDesc members without the BufMgrLock
  * (nobody should be able to change tags out from under us).
  *
- * Unpin if 'release' is TRUE.
+ * If 'sync' is true, a synchronous write is wanted (wait for buffer to hit
+ * the disk).  Otherwise it's sufficient to issue the kernel write call.
+ *
+ * Unpin buffer if 'release' is true.
  */
 int
-FlushBuffer(Buffer buffer, bool release)
+FlushBuffer(Buffer buffer, bool sync, bool release)
 {
 	BufferDesc *bufHdr;
 	Relation	bufrel;
 	int			status;
 
 	if (BufferIsLocal(buffer))
-		return FlushLocalBuffer(buffer, release) ? STATUS_OK : STATUS_ERROR;
+		return FlushLocalBuffer(buffer, sync, release) ? STATUS_OK : STATUS_ERROR;
 
 	if (BAD_BUFFER_ID(buffer))
 		return STATUS_ERROR;
@@ -755,12 +760,16 @@ FlushBuffer(Buffer buffer, bool release)
 	 */
 	LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_SHARE);
 
-	status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
-					   (char *) MAKE_PTR(bufHdr->data));
+	if (sync)
+		status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
+						   (char *) MAKE_PTR(bufHdr->data));
+	else
+		status = smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
+						   (char *) MAKE_PTR(bufHdr->data));
 
 	LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_UNLOCK);
 
-	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
 	RelationDecrementReferenceCount(bufrel);
 
 	if (status == SM_FAIL)
@@ -926,7 +935,7 @@ SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
 
 			/*
 			 * drop relcache refcnt incremented by
-			 * RelationIdCacheGetRelation
+			 * RelationNodeCacheGetRelation
 			 */
 			RelationDecrementReferenceCount(reln);
 		}
@@ -1123,7 +1132,7 @@ BufferSync()
 						bufHdr->flags &= ~BM_DIRTY;
 				}
 
-				/* drop refcnt obtained by RelationIdCacheGetRelation */
+				/* drop refcnt obtained by RelationNodeCacheGetRelation */
 				if (reln != (Relation) NULL)
 					RelationDecrementReferenceCount(reln);
 			}
@@ -1154,7 +1163,7 @@ BufferSync()
 
 				/*
 				 * drop relcache refcnt incremented by
-				 * RelationIdCacheGetRelation
+				 * RelationNodeCacheGetRelation
 				 */
 				RelationDecrementReferenceCount(reln);
 
@@ -1458,7 +1467,7 @@ BufferReplace(BufferDesc *bufHdr)
 
 	SpinAcquire(BufMgrLock);
 
-	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
 	if (reln != (Relation) NULL)
 		RelationDecrementReferenceCount(reln);
 
@@ -1495,21 +1504,23 @@ RelationGetNumberOfBlocks(Relation relation)
 }
 
 /* ---------------------------------------------------------------------
- *		ReleaseRelationBuffers
+ *		DropRelationBuffers
  *
  *		This function removes all the buffered pages for a relation
  *		from the buffer pool.  Dirty pages are simply dropped, without
- *		bothering to write them out first.  This is used when the
- *		relation is about to be deleted.  We assume that the caller
- *		holds an exclusive lock on the relation, which should assure
- *		that no new buffers will be acquired for the rel meanwhile.
+ *		bothering to write them out first.  This is NOT rollback-able,
+ *		and so should be used only with extreme caution!
+ *
+ *		We assume that the caller holds an exclusive lock on the relation,
+ *		which should assure that no new buffers will be acquired for the rel
+ *		meanwhile.
  *
  *		XXX currently it sequentially searches the buffer pool, should be
  *		changed to more clever ways of searching.
  * --------------------------------------------------------------------
  */
 void
-ReleaseRelationBuffers(Relation rel)
+DropRelationBuffers(Relation rel)
 {
 	int			i;
 	BufferDesc *bufHdr;
@@ -1589,7 +1600,104 @@ recheck:
 		 * this rel, since we hold exclusive lock on this rel.
 		 */
 		if (RelFileNodeEquals(rel->rd_node, 
-					  BufferTagLastDirtied[i - 1].rnode))
+							  BufferTagLastDirtied[i - 1].rnode))
+			BufferDirtiedByMe[i - 1] = false;
+	}
+
+	SpinRelease(BufMgrLock);
+}
+
+/* ---------------------------------------------------------------------
+ *		DropRelFileNodeBuffers
+ *
+ *		This is the same as DropRelationBuffers, except that the target
+ *		relation is specified by RelFileNode.
+ *
+ *		This is NOT rollback-able.  One legitimate use is to clear the
+ *		buffer cache of buffers for a relation that is being deleted
+ *		during transaction abort.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeBuffers(RelFileNode rnode)
+{
+	int			i;
+	BufferDesc *bufHdr;
+
+	/* We have to search both local and shared buffers... */
+
+	for (i = 0; i < NLocBuffer; i++)
+	{
+		bufHdr = &LocalBufferDescriptors[i];
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		{
+			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+			LocalRefCount[i] = 0;
+			bufHdr->tag.rnode.relNode = InvalidOid;
+		}
+	}
+
+	SpinAcquire(BufMgrLock);
+	for (i = 1; i <= NBuffers; i++)
+	{
+		bufHdr = &BufferDescriptors[i - 1];
+recheck:
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		{
+
+			/*
+			 * If there is I/O in progress, better wait till it's done;
+			 * don't want to delete the relation out from under someone
+			 * who's just trying to flush the buffer!
+			 */
+			if (bufHdr->flags & BM_IO_IN_PROGRESS)
+			{
+				WaitIO(bufHdr, BufMgrLock);
+
+				/*
+				 * By now, the buffer very possibly belongs to some other
+				 * rel, so check again before proceeding.
+				 */
+				goto recheck;
+			}
+			/* Now we can do what we came for */
+			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+
+			/*
+			 * Release any refcount we may have.
+			 *
+			 * This is very probably dead code, and if it isn't then it's
+			 * probably wrong.	I added the Assert to find out --- tgl
+			 * 11/99.
+			 */
+			if (!(bufHdr->flags & BM_FREE))
+			{
+				/* Assert checks that buffer will actually get freed! */
+				Assert(PrivateRefCount[i - 1] == 1 &&
+					   bufHdr->refcount == 1);
+				/* ReleaseBuffer expects we do not hold the lock at entry */
+				SpinRelease(BufMgrLock);
+				ReleaseBuffer(i);
+				SpinAcquire(BufMgrLock);
+			}
+			/*
+			 * And mark the buffer as no longer occupied by this rel.
+			 */
+			BufTableDelete(bufHdr);
+		}
+
+		/*
+		 * Also check to see if BufferDirtiedByMe info for this buffer
+		 * refers to the target relation, and clear it if so.  This is
+		 * independent of whether the current contents of the buffer
+		 * belong to the target relation!
+		 *
+		 * NOTE: we have no way to clear BufferDirtiedByMe info in other
+		 * backends, but hopefully there are none with that bit set for
+		 * this rel, since we hold exclusive lock on this rel.
+		 */
+		if (RelFileNodeEquals(rnode, 
+							  BufferTagLastDirtied[i - 1].rnode))
 			BufferDirtiedByMe[i - 1] = false;
 	}
 
@@ -1604,7 +1712,7 @@ recheck:
  *		bothering to write them out first.  This is used when we destroy a
  *		database, to avoid trying to flush data to disk when the directory
  *		tree no longer exists.	Implementation is pretty similar to
- *		ReleaseRelationBuffers() which is for destroying just one relation.
+ *		DropRelationBuffers() which is for destroying just one relation.
  * --------------------------------------------------------------------
  */
 void
@@ -1757,33 +1865,32 @@ BufferPoolBlowaway()
 /* ---------------------------------------------------------------------
  *		FlushRelationBuffers
  *
- *		This function flushes all dirty pages of a relation out to disk.
+ *		This function writes all dirty pages of a relation out to disk.
  *		Furthermore, pages that have blocknumber >= firstDelBlock are
  *		actually removed from the buffer pool.  An error code is returned
  *		if we fail to dump a dirty buffer or if we find one of
  *		the target pages is pinned into the cache.
  *
- *		This is used by VACUUM before truncating the relation to the given
- *		number of blocks.  (TRUNCATE TABLE also uses it in the same way.)
- *		It might seem unnecessary to flush dirty pages before firstDelBlock,
- *		since VACUUM should already have committed its changes.  However,
- *		it is possible for there still to be dirty pages: if some page
- *		had unwritten on-row tuple status updates from a prior transaction,
- *		and VACUUM had no additional changes to make to that page, then
- *		VACUUM won't have written it.  This is harmless in most cases but
- *		will break pg_upgrade, which relies on VACUUM to ensure that *all*
- *		tuples have correct on-row status.  So, we check and flush all
- *		dirty pages of the rel regardless of block number.
- *
- *		This is also used by RENAME TABLE (with firstDelBlock = 0)
- *		to clear out the buffer cache before renaming the physical files of
- *		a relation.  Without that, some other backend might try to do a
- *		blind write of a buffer page (relying on the BlindId of the buffer)
- *		and fail because it's not got the right filename anymore.
+ *		This is called by DROP TABLE to clear buffers for the relation
+ *		from the buffer pool.  Note that we must write dirty buffers,
+ *		rather than just dropping the changes, because our transaction
+ *		might abort later on; we want to roll back safely in that case.
+ *
+ *		This is also called by VACUUM before truncating the relation to the
+ *		given number of blocks.  It might seem unnecessary for VACUUM to
+ *		write dirty pages before firstDelBlock, since VACUUM should already
+ *		have committed its changes.  However, it is possible for there still
+ *		to be dirty pages: if some page had unwritten on-row tuple status
+ *		updates from a prior transaction, and VACUUM had no additional
+ *		changes to make to that page, then VACUUM won't have written it.
+ *		This is harmless in most cases but will break pg_upgrade, which
+ *		relies on VACUUM to ensure that *all* tuples have correct on-row
+ *		status.  So, we check and flush all dirty pages of the rel
+ *		regardless of block number.
  *
  *		In all cases, the caller should be holding AccessExclusiveLock on
  *		the target relation to ensure that no other backend is busy reading
- *		more blocks of the relation.
+ *		more blocks of the relation (or might do so before we commit).
  *
  *		Formerly, we considered it an error condition if we found dirty
  *		buffers here.	However, since BufferSync no longer forces out all
@@ -1812,7 +1919,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
 			{
 				if (bufHdr->flags & BM_DIRTY)
 				{
-					if (FlushBuffer(-i - 1, false) != STATUS_OK)
+					if (FlushBuffer(-i - 1, false, false) != STATUS_OK)
 					{
 						elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
 							 RelationGetRelationName(rel), firstDelBlock,
@@ -1840,15 +1947,17 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
 	for (i = 0; i < NBuffers; i++)
 	{
 		bufHdr = &BufferDescriptors[i];
-recheck:
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
 		{
 			if (bufHdr->flags & BM_DIRTY)
 			{
 				PinBuffer(bufHdr);
 				SpinRelease(BufMgrLock);
-				if (FlushBuffer(i + 1, true) != STATUS_OK)
+				if (FlushBuffer(i + 1, false, false) != STATUS_OK)
 				{
+					SpinAcquire(BufMgrLock);
+					UnpinBuffer(bufHdr);
+					SpinRelease(BufMgrLock);
 					elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
 						 RelationGetRelationName(rel), firstDelBlock,
 						 bufHdr->tag.blockNum,
@@ -1856,12 +1965,7 @@ recheck:
 					return -1;
 				}
 				SpinAcquire(BufMgrLock);
-
-				/*
-				 * Buffer could already be reassigned, so must recheck
-				 * whether it still belongs to rel before freeing it!
-				 */
-				goto recheck;
+				UnpinBuffer(bufHdr);
 			}
 			if (!(bufHdr->flags & BM_FREE))
 			{
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index faa3304b4f..352f519bdc 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -16,7 +16,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.33 2000/10/28 16:20:56 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.34 2000/11/08 22:09:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -183,7 +183,7 @@ WriteLocalBuffer(Buffer buffer, bool release)
  *	  flushes a local buffer
  */
 int
-FlushLocalBuffer(Buffer buffer, bool release)
+FlushLocalBuffer(Buffer buffer, bool sync, bool release)
 {
 	int			bufid;
 	Relation	bufrel;
@@ -199,13 +199,18 @@ FlushLocalBuffer(Buffer buffer, bool release)
 	bufHdr = &LocalBufferDescriptors[bufid];
 	bufHdr->flags &= ~BM_DIRTY;
 	bufrel = RelationNodeCacheGetRelation(bufHdr->tag.rnode);
-
 	Assert(bufrel != NULL);
-	smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
-			  (char *) MAKE_PTR(bufHdr->data));
+
+	if (sync)
+		smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
+				  (char *) MAKE_PTR(bufHdr->data));
+	else
+		smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
+				  (char *) MAKE_PTR(bufHdr->data));
+
 	LocalBufferFlushCount++;
 
-	/* drop relcache refcount incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcount incremented by RelationNodeCacheGetRelation */
 	RelationDecrementReferenceCount(bufrel);
 
 	if (release)
diff --git a/src/backend/storage/buffer/xlog_bufmgr.c b/src/backend/storage/buffer/xlog_bufmgr.c
index dcd377b7eb..15c4321405 100644
--- a/src/backend/storage/buffer/xlog_bufmgr.c
+++ b/src/backend/storage/buffer/xlog_bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.1 2000/10/28 16:20:56 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.2 2000/11/08 22:09:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -838,7 +838,7 @@ BufferSync()
 
 		SpinRelease(BufMgrLock);
 
-		/* drop refcnt obtained by RelationIdCacheGetRelation */
+		/* drop refcnt obtained by RelationNodeCacheGetRelation */
 		if (reln != (Relation) NULL)
 		{
 			RelationDecrementReferenceCount(reln);
@@ -1128,7 +1128,7 @@ BufferReplace(BufferDesc *bufHdr)
 							  false);	/* no fsync */
 	}
 
-	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
 	if (reln != (Relation) NULL)
 		RelationDecrementReferenceCount(reln);
 
@@ -1159,21 +1159,23 @@ RelationGetNumberOfBlocks(Relation relation)
 }
 
 /* ---------------------------------------------------------------------
- *		ReleaseRelationBuffers
+ *		DropRelationBuffers
  *
  *		This function removes all the buffered pages for a relation
  *		from the buffer pool.  Dirty pages are simply dropped, without
- *		bothering to write them out first.  This is used when the
- *		relation is about to be deleted.  We assume that the caller
- *		holds an exclusive lock on the relation, which should assure
- *		that no new buffers will be acquired for the rel meanwhile.
+ *		bothering to write them out first.  This is NOT rollback-able,
+ *		and so should be used only with extreme caution!
+ *
+ *		We assume that the caller holds an exclusive lock on the relation,
+ *		which should assure that no new buffers will be acquired for the rel
+ *		meanwhile.
  *
  *		XXX currently it sequentially searches the buffer pool, should be
  *		changed to more clever ways of searching.
  * --------------------------------------------------------------------
  */
 void
-ReleaseRelationBuffers(Relation rel)
+DropRelationBuffers(Relation rel)
 {
 	int			i;
 	BufferDesc *bufHdr;
@@ -1248,6 +1250,91 @@ recheck:
 	SpinRelease(BufMgrLock);
 }
 
+/* ---------------------------------------------------------------------
+ *		DropRelFileNodeBuffers
+ *
+ *		This is the same as DropRelationBuffers, except that the target
+ *		relation is specified by RelFileNode.  Dirty pages are NOT written.
+ *
+ *		This is NOT rollback-able.  One legitimate use is to clear the
+ *		buffer cache of buffers for a relation that is being deleted
+ *		during transaction abort.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeBuffers(RelFileNode rnode)
+{
+	int			i;
+	BufferDesc *bufHdr;
+
+	/* Search local buffers first (before taking BufMgrLock), then shared */
+
+	for (i = 0; i < NLocBuffer; i++)
+	{
+		bufHdr = &LocalBufferDescriptors[i];
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		{
+			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+			bufHdr->cntxDirty = false;
+			LocalRefCount[i] = 0;	/* zero the recorded local refcount */
+			bufHdr->tag.rnode.relNode = InvalidOid;	/* tag no longer names this rel */
+		}
+	}
+
+	SpinAcquire(BufMgrLock);
+	for (i = 1; i <= NBuffers; i++)		/* i is the 1-based buffer number */
+	{
+		bufHdr = &BufferDescriptors[i - 1];
+recheck:
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		{
+
+			/*
+			 * If there is I/O in progress, better wait till it's done;
+			 * don't want to delete the relation out from under someone
+			 * who's just trying to flush the buffer!
+			 */
+			if (bufHdr->flags & BM_IO_IN_PROGRESS)
+			{
+				WaitIO(bufHdr, BufMgrLock);
+
+				/*
+				 * By now, the buffer very possibly belongs to some other
+				 * rel, so check again before proceeding.
+				 */
+				goto recheck;
+			}
+			/* Now we can do what we came for: discard any pending changes */
+			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+			bufHdr->cntxDirty = false;
+
+			/*
+			 * Release any refcount we may have.
+			 *
+			 * This is very probably dead code, and if it isn't then it's
+			 * probably wrong.  I added the Assert to find out --- tgl
+			 * 11/99.
+			 */
+			if (!(bufHdr->flags & BM_FREE))
+			{
+				/* Assert checks that buffer will actually get freed! */
+				Assert(PrivateRefCount[i - 1] == 1 &&
+					   bufHdr->refcount == 1);
+				/* ReleaseBuffer expects we do not hold the lock at entry */
+				SpinRelease(BufMgrLock);
+				ReleaseBuffer(i);
+				SpinAcquire(BufMgrLock);
+			}
+			/*
+			 * And mark the buffer as no longer occupied by this rel.
+			 */
+			BufTableDelete(bufHdr);
+		}
+	}
+
+	SpinRelease(BufMgrLock);
+}
+
 /* ---------------------------------------------------------------------
  *		DropBuffers
  *
@@ -1256,7 +1343,7 @@ recheck:
  *		bothering to write them out first.  This is used when we destroy a
  *		database, to avoid trying to flush data to disk when the directory
  *		tree no longer exists.	Implementation is pretty similar to
- *		ReleaseRelationBuffers() which is for destroying just one relation.
+ *		DropRelationBuffers() which is for destroying just one relation.
  * --------------------------------------------------------------------
  */
 void
@@ -1399,33 +1486,32 @@ BufferPoolBlowaway()
 /* ---------------------------------------------------------------------
  *		FlushRelationBuffers
  *
- *		This function flushes all dirty pages of a relation out to disk.
+ *		This function writes all dirty pages of a relation out to disk.
  *		Furthermore, pages that have blocknumber >= firstDelBlock are
  *		actually removed from the buffer pool.  An error code is returned
  *		if we fail to dump a dirty buffer or if we find one of
  *		the target pages is pinned into the cache.
  *
- *		This is used by VACUUM before truncating the relation to the given
- *		number of blocks.  (TRUNCATE TABLE also uses it in the same way.)
- *		It might seem unnecessary to flush dirty pages before firstDelBlock,
- *		since VACUUM should already have committed its changes.  However,
- *		it is possible for there still to be dirty pages: if some page
- *		had unwritten on-row tuple status updates from a prior transaction,
- *		and VACUUM had no additional changes to make to that page, then
- *		VACUUM won't have written it.  This is harmless in most cases but
- *		will break pg_upgrade, which relies on VACUUM to ensure that *all*
- *		tuples have correct on-row status.  So, we check and flush all
- *		dirty pages of the rel regardless of block number.
+ *		This is called by DROP TABLE to clear buffers for the relation
+ *		from the buffer pool.  Note that we must write dirty buffers,
+ *		rather than just dropping the changes, because our transaction
+ *		might abort later on; we want to roll back safely in that case.
  *
- *		This is also used by RENAME TABLE (with firstDelBlock = 0)
- *		to clear out the buffer cache before renaming the physical files of
- *		a relation.  Without that, some other backend might try to do a
- *		blind write of a buffer page (relying on the BlindId of the buffer)
- *		and fail because it's not got the right filename anymore.
+ *		This is also called by VACUUM before truncating the relation to the
+ *		given number of blocks.  It might seem unnecessary for VACUUM to
+ *		write dirty pages before firstDelBlock, since VACUUM should already
+ *		have committed its changes.  However, it is possible for there still
+ *		to be dirty pages: if some page had unwritten on-row tuple status
+ *		updates from a prior transaction, and VACUUM had no additional
+ *		changes to make to that page, then VACUUM won't have written it.
+ *		This is harmless in most cases but will break pg_upgrade, which
+ *		relies on VACUUM to ensure that *all* tuples have correct on-row
+ *		status.  So, we check and flush all dirty pages of the rel
+ *		regardless of block number.
  *
  *		In all cases, the caller should be holding AccessExclusiveLock on
  *		the target relation to ensure that no other backend is busy reading
- *		more blocks of the relation.
+ *		more blocks of the relation (or might do so before we commit).
  *
  *		Formerly, we considered it an error condition if we found dirty
  *		buffers here.	However, since BufferSync no longer forces out all
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 23a2dcf1e2..14325e5318 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.71 2000/07/17 03:05:08 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.72 2000/11/08 22:10:00 tgl Exp $
  *
  * NOTES
  *	  Outside modules can create a lock table and acquire/release
@@ -453,7 +453,7 @@ LockMethodTableRename(LOCKMETHOD lockmethod)
 bool
 LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 {
-	XIDLookupEnt *result,
+	XIDLookupEnt *xident,
 				item;
 	HTAB	   *xidTable;
 	bool		found;
@@ -559,9 +559,9 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 	/*
 	 * Find or create an xid entry with this tag
 	 */
-	result = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &item,
+	xident = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &item,
 										  HASH_ENTER, &found);
-	if (!result)
+	if (!xident)
 	{
 		SpinRelease(masterLock);
 		elog(NOTICE, "LockAcquire: xid table corrupted");
@@ -573,16 +573,41 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 	 */
 	if (!found)
 	{
-		result->nHolding = 0;
-		MemSet((char *) result->holders, 0, sizeof(int) * MAX_LOCKMODES);
-		ProcAddLock(&result->queue);
-		XID_PRINT("LockAcquire: new", result);
+		xident->nHolding = 0;
+		MemSet((char *) xident->holders, 0, sizeof(int) * MAX_LOCKMODES);
+		ProcAddLock(&xident->queue);
+		XID_PRINT("LockAcquire: new", xident);
 	}
 	else
 	{
-		XID_PRINT("LockAcquire: found", result);
-		Assert((result->nHolding > 0) && (result->holders[lockmode] >= 0));
-		Assert(result->nHolding <= lock->nActive);
+		int			i;
+
+		XID_PRINT("LockAcquire: found", xident);
+		Assert((xident->nHolding > 0) && (xident->holders[lockmode] >= 0));
+		Assert(xident->nHolding <= lock->nActive);
+		/*
+		 * Issue warning if we already hold a lower-level lock on this
+		 * object and do not hold a lock of the requested level or higher.
+		 * This indicates a deadlock-prone coding practice (eg, we'd have
+		 * a deadlock if another backend were following the same code path
+		 * at about the same time).
+		 *
+		 * XXX Doing numeric comparison on the lockmodes is a hack;
+		 * it'd be better to use a table.  For now, though, this works.
+		 */
+		for (i = lockMethodTable->ctl->numLockModes; i > 0; i--)
+		{
+			if (xident->holders[i] > 0)
+			{
+				if (i >= (int) lockmode)
+					break;		/* safe: we have a lock >= req level */
+				elog(DEBUG, "Deadlock risk: raising lock level"
+					 " from %s to %s on object %u/%u/%u",
+					 lock_types[i], lock_types[lockmode],
+					 lock->tag.relId, lock->tag.dbId, lock->tag.objId.blkno);
+				break;
+			}
+		}
 	}
 
 	/* ----------------
@@ -601,12 +626,12 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 	 * hold this lock.
 	 * --------------------
 	 */
-	if (result->nHolding == lock->nActive || result->holders[lockmode] != 0)
+	if (xident->nHolding == lock->nActive || xident->holders[lockmode] != 0)
 	{
-		result->holders[lockmode]++;
-		result->nHolding++;
-		XID_PRINT("LockAcquire: owning", result);
-		Assert((result->nHolding > 0) && (result->holders[lockmode] > 0));
+		xident->holders[lockmode]++;
+		xident->nHolding++;
+		XID_PRINT("LockAcquire: owning", xident);
+		Assert((xident->nHolding > 0) && (xident->holders[lockmode] > 0));
 		GrantLock(lock, lockmode);
 		SpinRelease(masterLock);
 		return TRUE;
@@ -623,27 +648,27 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 		 * If I don't hold locks or my locks don't conflict with waiters
 		 * then force to sleep.
 		 */
-		if (result->nHolding > 0)
+		if (xident->nHolding > 0)
 		{
 			for (; i <= lockMethodTable->ctl->numLockModes; i++)
 			{
-				if (result->holders[i] > 0 &&
+				if (xident->holders[i] > 0 &&
 					lockMethodTable->ctl->conflictTab[i] & lock->waitMask)
 					break;		/* conflict */
 			}
 		}
 
-		if (result->nHolding == 0 || i > lockMethodTable->ctl->numLockModes)
+		if (xident->nHolding == 0 || i > lockMethodTable->ctl->numLockModes)
 		{
 			XID_PRINT("LockAcquire: higher priority proc waiting",
-					  result);
+					  xident);
 			status = STATUS_FOUND;
 		}
 		else
-			status = LockResolveConflicts(lockmethod, lock, lockmode, xid, result);
+			status = LockResolveConflicts(lockmethod, lock, lockmode, xid, xident);
 	}
 	else
-		status = LockResolveConflicts(lockmethod, lock, lockmode, xid, result);
+		status = LockResolveConflicts(lockmethod, lock, lockmode, xid, xident);
 
 	if (status == STATUS_OK)
 		GrantLock(lock, lockmode);
@@ -657,17 +682,17 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 		 */
 		if (lockmethod == USER_LOCKMETHOD)
 		{
-			if (!result->nHolding)
+			if (!xident->nHolding)
 			{
-				SHMQueueDelete(&result->queue);
-				result = (XIDLookupEnt *) hash_search(xidTable,
-													  (Pointer) result,
+				SHMQueueDelete(&xident->queue);
+				xident = (XIDLookupEnt *) hash_search(xidTable,
+													  (Pointer) xident,
 													HASH_REMOVE, &found);
-				if (!result || !found)
+				if (!xident || !found)
 					elog(NOTICE, "LockAcquire: remove xid, table corrupted");
 			}
 			else
-				XID_PRINT("LockAcquire: NHOLDING", result);
+				XID_PRINT("LockAcquire: NHOLDING", xident);
 			lock->nHolding--;
 			lock->holders[lockmode]--;
 			LOCK_PRINT("LockAcquire: user lock failed", lock, lockmode);
@@ -682,7 +707,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 		 * Construct bitmask of locks we hold before going to sleep.
 		 */
 		MyProc->holdLock = 0;
-		if (result->nHolding > 0)
+		if (xident->nHolding > 0)
 		{
 			int			i,
 						tmpMask = 2;
@@ -690,7 +715,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 			for (i = 1; i <= lockMethodTable->ctl->numLockModes;
 				 i++, tmpMask <<= 1)
 			{
-				if (result->holders[i] > 0)
+				if (xident->holders[i] > 0)
 					MyProc->holdLock |= tmpMask;
 			}
 			Assert(MyProc->holdLock != 0);
@@ -702,15 +727,15 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 		 * Check the xid entry status, in case something in the ipc
 		 * communication doesn't work correctly.
 		 */
-		if (!((result->nHolding > 0) && (result->holders[lockmode] > 0)))
+		if (!((xident->nHolding > 0) && (xident->holders[lockmode] > 0)))
 		{
-			XID_PRINT("LockAcquire: INCONSISTENT", result);
+			XID_PRINT("LockAcquire: INCONSISTENT", xident);
 			LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
 			/* Should we retry ? */
 			SpinRelease(masterLock);
 			return FALSE;
 		}
-		XID_PRINT("LockAcquire: granted", result);
+		XID_PRINT("LockAcquire: granted", xident);
 		LOCK_PRINT("LockAcquire: granted", lock, lockmode);
 	}
 
@@ -738,7 +763,7 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 					 TransactionId xid,
 					 XIDLookupEnt *xidentP)		/* xident ptr or NULL */
 {
-	XIDLookupEnt *result,
+	XIDLookupEnt *xident,
 				item;
 	int		   *myHolders;
 	int			numLockModes;
@@ -758,7 +783,7 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 		 * A pointer to the xid entry was supplied from the caller.
 		 * Actually only LockAcquire can do it.
 		 */
-		result = xidentP;
+		xident = xidentP;
 	}
 	else
 	{
@@ -788,9 +813,9 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 		/*
 		 * Find or create an xid entry with this tag
 		 */
-		result = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &item,
+		xident = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &item,
 											  HASH_ENTER, &found);
-		if (!result)
+		if (!xident)
 		{
 			elog(NOTICE, "LockResolveConflicts: xid table corrupted");
 			return STATUS_ERROR;
@@ -808,14 +833,14 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 			 * the lock stats.
 			 * ---------------
 			 */
-			MemSet(result->holders, 0, numLockModes * sizeof(*(lock->holders)));
-			result->nHolding = 0;
-			XID_PRINT("LockResolveConflicts: NOT FOUND", result);
+			MemSet(xident->holders, 0, numLockModes * sizeof(*(lock->holders)));
+			xident->nHolding = 0;
+			XID_PRINT("LockResolveConflicts: NOT FOUND", xident);
 		}
 		else
-			XID_PRINT("LockResolveConflicts: found", result);
+			XID_PRINT("LockResolveConflicts: found", xident);
 	}
-	Assert((result->nHolding >= 0) && (result->holders[lockmode] >= 0));
+	Assert((xident->nHolding >= 0) && (xident->holders[lockmode] >= 0));
 
 	/* ----------------------------
 	 * first check for global conflicts: If no locks conflict
@@ -829,10 +854,10 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 	 */
 	if (!(LockMethodTable[lockmethod]->ctl->conflictTab[lockmode] & lock->mask))
 	{
-		result->holders[lockmode]++;
-		result->nHolding++;
-		XID_PRINT("LockResolveConflicts: no conflict", result);
-		Assert((result->nHolding > 0) && (result->holders[lockmode] > 0));
+		xident->holders[lockmode]++;
+		xident->nHolding++;
+		XID_PRINT("LockResolveConflicts: no conflict", xident);
+		Assert((xident->nHolding > 0) && (xident->holders[lockmode] > 0));
 		return STATUS_OK;
 	}
 
@@ -842,7 +867,7 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 	 * that does not reflect our own locks.
 	 * ------------------------
 	 */
-	myHolders = result->holders;
+	myHolders = xident->holders;
 	bitmask = 0;
 	tmpMask = 2;
 	for (i = 1; i <= numLockModes; i++, tmpMask <<= 1)
@@ -861,14 +886,14 @@ LockResolveConflicts(LOCKMETHOD lockmethod,
 	if (!(LockMethodTable[lockmethod]->ctl->conflictTab[lockmode] & bitmask))
 	{
 		/* no conflict. Get the lock and go on */
-		result->holders[lockmode]++;
-		result->nHolding++;
-		XID_PRINT("LockResolveConflicts: resolved", result);
-		Assert((result->nHolding > 0) && (result->holders[lockmode] > 0));
+		xident->holders[lockmode]++;
+		xident->nHolding++;
+		XID_PRINT("LockResolveConflicts: resolved", xident);
+		Assert((xident->nHolding > 0) && (xident->holders[lockmode] > 0));
 		return STATUS_OK;
 	}
 
-	XID_PRINT("LockResolveConflicts: conflicting", result);
+	XID_PRINT("LockResolveConflicts: conflicting", xident);
 	return STATUS_FOUND;
 }
 
@@ -965,7 +990,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 	SPINLOCK	masterLock;
 	bool		found;
 	LOCKMETHODTABLE *lockMethodTable;
-	XIDLookupEnt *result,
+	XIDLookupEnt *xident,
 				item;
 	HTAB	   *xidTable;
 	TransactionId xid;
@@ -1053,9 +1078,9 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 	 * Find an xid entry with this tag
 	 */
 	xidTable = lockMethodTable->xidHash;
-	result = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &item,
+	xident = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &item,
 										  HASH_FIND_SAVE, &found);
-	if (!result || !found)
+	if (!xident || !found)
 	{
 		SpinRelease(masterLock);
 #ifdef USER_LOCKS
@@ -1066,23 +1091,23 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 			elog(NOTICE, "LockRelease: xid table corrupted");
 		return FALSE;
 	}
-	XID_PRINT("LockRelease: found", result);
-	Assert(result->tag.lock == MAKE_OFFSET(lock));
+	XID_PRINT("LockRelease: found", xident);
+	Assert(xident->tag.lock == MAKE_OFFSET(lock));
 
 	/*
 	 * Check that we are actually holding a lock of the type we want to
 	 * release.
 	 */
-	if (!(result->holders[lockmode] > 0))
+	if (!(xident->holders[lockmode] > 0))
 	{
 		SpinRelease(masterLock);
-		XID_PRINT("LockAcquire: WRONGTYPE", result);
+		XID_PRINT("LockAcquire: WRONGTYPE", xident);
 		elog(NOTICE, "LockRelease: you don't own a lock of type %s",
 			 lock_types[lockmode]);
-		Assert(result->holders[lockmode] >= 0);
+		Assert(xident->holders[lockmode] >= 0);
 		return FALSE;
 	}
-	Assert(result->nHolding > 0);
+	Assert(xident->nHolding > 0);
 
 	/*
 	 * fix the general lock stats
@@ -1147,27 +1172,27 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag, LOCKMODE lockmode)
 	 * now check to see if I have any private locks.  If I do, decrement
 	 * the counts associated with them.
 	 */
-	result->holders[lockmode]--;
-	result->nHolding--;
-	XID_PRINT("LockRelease: updated", result);
-	Assert((result->nHolding >= 0) && (result->holders[lockmode] >= 0));
+	xident->holders[lockmode]--;
+	xident->nHolding--;
+	XID_PRINT("LockRelease: updated", xident);
+	Assert((xident->nHolding >= 0) && (xident->holders[lockmode] >= 0));
 
 	/*
 	 * If this was my last hold on this lock, delete my entry in the XID
 	 * table.
 	 */
-	if (!result->nHolding)
+	if (!xident->nHolding)
 	{
-		if (result->queue.prev == INVALID_OFFSET)
+		if (xident->queue.prev == INVALID_OFFSET)
 			elog(NOTICE, "LockRelease: xid.prev == INVALID_OFFSET");
-		if (result->queue.next == INVALID_OFFSET)
+		if (xident->queue.next == INVALID_OFFSET)
 			elog(NOTICE, "LockRelease: xid.next == INVALID_OFFSET");
-		if (result->queue.next != INVALID_OFFSET)
-			SHMQueueDelete(&result->queue);
-		XID_PRINT("LockRelease: deleting", result);
-		result = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &result,
+		if (xident->queue.next != INVALID_OFFSET)
+			SHMQueueDelete(&xident->queue);
+		XID_PRINT("LockRelease: deleting", xident);
+		xident = (XIDLookupEnt *) hash_search(xidTable, (Pointer) &xident,
 											  HASH_REMOVE_SAVED, &found);
-		if (!result || !found)
+		if (!xident || !found)
 		{
 			SpinRelease(masterLock);
 			elog(NOTICE, "LockRelease: remove xid, table corrupted");
@@ -1196,7 +1221,7 @@ LockReleaseAll(LOCKMETHOD lockmethod, SHM_QUEUE *lockQueue)
 	int			done;
 	XIDLookupEnt *xidLook = NULL;
 	XIDLookupEnt *tmp = NULL;
-	XIDLookupEnt *result;
+	XIDLookupEnt *xident;
 	SHMEM_OFFSET end = MAKE_OFFSET(lockQueue);
 	SPINLOCK	masterLock;
 	LOCKMETHODTABLE *lockMethodTable;
@@ -1371,11 +1396,11 @@ LockReleaseAll(LOCKMETHOD lockmethod, SHM_QUEUE *lockQueue)
 		 */
 
 		XID_PRINT("LockReleaseAll: deleting", xidLook);
-		result = (XIDLookupEnt *) hash_search(lockMethodTable->xidHash,
+		xident = (XIDLookupEnt *) hash_search(lockMethodTable->xidHash,
 											  (Pointer) xidLook,
 											  HASH_REMOVE,
 											  &found);
-		if (!result || !found)
+		if (!xident || !found)
 		{
 			SpinRelease(masterLock);
 			elog(NOTICE, "LockReleaseAll: xid table corrupted");
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index da466afe9f..c97a46ba4b 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,17 +8,17 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.77 2000/10/28 16:20:57 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.78 2000/11/08 22:10:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
+#include "postgres.h"
+
 #include <errno.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/file.h>
 
-#include "postgres.h"
-
 #include "catalog/catalog.h"
 #include "miscadmin.h"
 #include "storage/smgr.h"
@@ -123,63 +123,39 @@ mdinit()
 int
 mdcreate(Relation reln)
 {
+	char	   *path;
 	int			fd,
 				vfd;
-	char	   *path;
 
-	Assert(reln->rd_unlinked && reln->rd_fd < 0);
+	Assert(reln->rd_fd < 0);
 
 	path = relpath(reln->rd_node);
-	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 
-	/*
-	 * For cataloged relations, pg_class is guaranteed to have a unique
-	 * record with the same relname by the unique index. So we are able to
-	 * reuse existent files for new cataloged relations. Currently we reuse
-	 * them in the following cases. 1. they are empty. 2. they are used
-	 * for Index relations and their size == BLCKSZ * 2.
-	 *
-	 * During bootstrap processing, we skip that check, because pg_time,
-	 * pg_variable, and pg_log get created before their .bki file entries
-	 * are processed.
-	 */
+	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 
 	if (fd < 0)
 	{
 		int		save_errno = errno;
 
-		if (!IsBootstrapProcessingMode() &&
-			reln->rd_rel->relkind == RELKIND_UNCATALOGED)
-			return -1;
-
-		fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
+		/*
+		 * During bootstrap, there are cases where a system relation will be
+		 * accessed (by internal backend processes) before the bootstrap
+		 * script nominally creates it.  Therefore, allow the file to exist
+		 * already, but in bootstrap mode only.  (See also mdopen)
+		 */
+		if (IsBootstrapProcessingMode())
+			fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 		if (fd < 0)
 		{
+			pfree(path);
 			/* be sure to return the error reported by create, not open */
 			errno = save_errno;
 			return -1;
 		}
-		if (!IsBootstrapProcessingMode())
-		{
-			bool		reuse = false;
-			long		len = FileSeek(fd, 0L, SEEK_END);
-
-			if (len == 0)
-				reuse = true;
-			else if (reln->rd_rel->relkind == RELKIND_INDEX &&
-					 len == BLCKSZ * 2)
-				reuse = true;
-			if (!reuse)
-			{
-				FileClose(fd);
-				/* be sure to return the error reported by create */
-				errno = save_errno;
-				return -1;
-			}
-		}
 		errno = 0;
 	}
-	reln->rd_unlinked = false;
+
+	pfree(path);
 
 	vfd = _fdvec_alloc();
 	if (vfd < 0)
@@ -187,12 +163,10 @@ mdcreate(Relation reln)
 
 	Md_fdvec[vfd].mdfd_vfd = fd;
 	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
+	Md_fdvec[vfd].mdfd_lstbcnt = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
 	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
 #endif
-	Md_fdvec[vfd].mdfd_lstbcnt = 0;
-
-	pfree(path);
 
 	return vfd;
 }
@@ -201,65 +175,50 @@ mdcreate(Relation reln)
  *	mdunlink() -- Unlink a relation.
  */
 int
-mdunlink(Relation reln)
+mdunlink(RelFileNode rnode)
 {
-	int			nblocks;
-	int			fd;
-	MdfdVec    *v;
-
-	/*
-	 * If the relation is already unlinked,we have nothing to do any more.
-	 */
-	if (reln->rd_unlinked && reln->rd_fd < 0)
-		return SM_SUCCESS;
-
-	/*
-	 * Force all segments of the relation to be opened, so that we won't
-	 * miss deleting any of them.
-	 */
-	nblocks = mdnblocks(reln);
+	int			status = SM_SUCCESS;
+	int			save_errno = 0;
+	char	   *path;
 
-	/*
-	 * Clean out the mdfd vector, letting fd.c unlink the physical files.
-	 *
-	 * NOTE: We truncate the file(s) before deleting 'em, because if other
-	 * backends are holding the files open, the unlink will fail on some
-	 * platforms (think Microsoft).  Better a zero-size file gets left
-	 * around than a big file.	Those other backends will be forced to
-	 * close the relation by cache invalidation, but that probably hasn't
-	 * happened yet.
-	 */
-	fd = RelationGetFile(reln);
-	if (fd < 0)					/* should not happen */
-		elog(ERROR, "mdunlink: mdnblocks didn't open relation");
+	path = relpath(rnode);
 
-	Md_fdvec[fd].mdfd_flags = (uint16) 0;
+	/* Delete the first segment, or only segment if not doing segmenting */
+	if (unlink(path) < 0)
+	{
+		status = SM_FAIL;
+		save_errno = errno;		/* restored into errno just before return */
+	}
 
 #ifndef LET_OS_MANAGE_FILESIZE
-	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
+	/* Remove the additional segments (path.1, path.2, ...), if any */
+	if (status == SM_SUCCESS)	/* skipped if the first unlink failed */
 	{
-		MdfdVec    *ov = v;
+		char	   *segpath = (char *) palloc(strlen(path) + 12);	/* 12 = '.' + up to 10 digits + NUL */
+		int			segno;
 
-		FileTruncate(v->mdfd_vfd, 0);
-		FileUnlink(v->mdfd_vfd);
-		v = v->mdfd_chain;
-		if (ov != &Md_fdvec[fd])
-			pfree(ov);
+		for (segno = 1; ; segno++)
+		{
+			sprintf(segpath, "%s.%d", path, segno);
+			if (unlink(segpath) < 0)
+			{
+				/* ENOENT is expected after the last segment... */
+				if (errno != ENOENT)
+				{
+					status = SM_FAIL;
+					save_errno = errno;
+				}
+				break;
+			}
+		}
+		pfree(segpath);
 	}
-	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
-#else
-	v = &Md_fdvec[fd];
-	FileTruncate(v->mdfd_vfd, 0);
-	FileUnlink(v->mdfd_vfd);
 #endif
 
-	_fdvec_free(fd);
-
-	/* be sure to mark relation closed && unlinked */
-	reln->rd_fd = -1;
-	reln->rd_unlinked = true;
+	pfree(path);
 
-	return SM_SUCCESS;
+	errno = save_errno;			/* errno of the failed unlink, or 0 if none failed */
+	return status;
 }
 
 /*
@@ -327,24 +286,29 @@ mdopen(Relation reln)
 	int			vfd;
 
 	Assert(reln->rd_fd < 0);
+
 	path = relpath(reln->rd_node);
 
 	fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
+
 	if (fd < 0)
 	{
-		/* in bootstrap mode, accept mdopen as substitute for mdcreate */
+		/*
+		 * During bootstrap, there are cases where a system relation will be
+		 * accessed (by internal backend processes) before the bootstrap
+		 * script nominally creates it.  Therefore, accept mdopen() as a
+		 * substitute for mdcreate() in bootstrap mode only.  (See mdcreate)
+		 */
 		if (IsBootstrapProcessingMode())
 			fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 		if (fd < 0)
 		{
-			elog(NOTICE, "mdopen: couldn't open %s: %m", path);
-			/* mark relation closed and unlinked */
-			reln->rd_fd = -1;
-			reln->rd_unlinked = true;
+			pfree(path);
 			return -1;
 		}
 	}
-	reln->rd_unlinked = false;
+
+	pfree(path);
 
 	vfd = _fdvec_alloc();
 	if (vfd < 0)
@@ -362,8 +326,6 @@ mdopen(Relation reln)
 #endif
 #endif
 
-	pfree(path);
-
 	return vfd;
 }
 
diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c
index a5b22cbcc5..d64aeb6a41 100644
--- a/src/backend/storage/smgr/mm.c
+++ b/src/backend/storage/smgr/mm.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.19 2000/04/10 23:41:51 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.20 2000/11/08 22:10:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -204,9 +204,11 @@ mmcreate(Relation reln)
 
 /*
  *	mmunlink() -- Unlink a relation.
+ *
+ * XXX currently broken: body not yet converted from Relation to RelFileNode
  */
 int
-mmunlink(Relation reln)
+mmunlink(RelFileNode rnode)
 {
 	int			i;
 	Oid			reldbid;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d2a940a76e..01a7877e80 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,13 +11,16 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.42 2000/10/28 16:20:57 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.43 2000/11/08 22:10:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
+#include "storage/bufmgr.h"
 #include "storage/smgr.h"
+#include "utils/memutils.h"
+
 
 static void smgrshutdown(void);
 
@@ -26,7 +29,7 @@ typedef struct f_smgr
 	int			(*smgr_init) (void);	/* may be NULL */
 	int			(*smgr_shutdown) (void);		/* may be NULL */
 	int			(*smgr_create) (Relation reln);
-	int			(*smgr_unlink) (Relation reln);
+	int			(*smgr_unlink) (RelFileNode rnode);
 	int			(*smgr_extend) (Relation reln, char *buffer);
 	int			(*smgr_open) (Relation reln);
 	int			(*smgr_close) (Relation reln);
@@ -60,10 +63,11 @@ static f_smgr smgrsw[] = {
 	{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
 		mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
 #ifdef XLOG
-	mdnblocks, mdtruncate, mdcommit, mdabort, mdsync},
+	mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
 #else
-	mdnblocks, mdtruncate, mdcommit, mdabort},
+	mdnblocks, mdtruncate, mdcommit, mdabort
 #endif
+	},
 
 #ifdef STABLE_MEMORY_STORAGE
 	/* main memory */
@@ -93,6 +97,31 @@ static bool smgrwo[] = {
 
 static int	NSmgr = lengthof(smgrsw);
 
+/*
+ * We keep a list of all relations (represented as RelFileNode values)
+ * that have been created or deleted in the current transaction.  When
+ * a relation is created, we create the physical file immediately, but
+ * remember it so that we can delete the file again if the current
+ * transaction is aborted.  Conversely, a deletion request is NOT
+ * executed immediately, but is just entered in the list.  When and if
+ * the transaction commits, we can delete the physical file.
+ *
+ * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
+ * prematurely.  It'd probably be OK to keep it in TopTransactionContext,
+ * but I'm being paranoid.
+ */
+
+typedef struct PendingRelDelete
+{
+	RelFileNode relnode;		/* relation that may need to be deleted */
+	int16 which;				/* which storage manager? */
+	bool atCommit;				/* T=delete at commit; F=delete at abort */
+	struct PendingRelDelete *next; /* linked-list link */
+} PendingRelDelete;
+
+static PendingRelDelete *pendingDeletes = NULL;	/* head of linked list */
+
+
 /*
  *	smgrinit(), smgrshutdown() -- Initialize or shut down all storage
  *								  managers.
@@ -147,27 +176,58 @@ int
 smgrcreate(int16 which, Relation reln)
 {
 	int			fd;
+	PendingRelDelete *pending;
 
 	if ((fd = (*(smgrsw[which].smgr_create)) (reln)) < 0)
 		elog(ERROR, "cannot create %s: %m", RelationGetRelationName(reln));
 
+	/* Add the relation to the list of stuff to delete at abort */
+	pending = (PendingRelDelete *)
+		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+	pending->relnode = reln->rd_node;
+	pending->which = which;
+	pending->atCommit = false;	/* delete if abort */
+	pending->next = pendingDeletes;
+	pendingDeletes = pending;
+
 	return fd;
 }
 
 /*
  *	smgrunlink() -- Unlink a relation.
  *
- *		The relation is removed from the store.
+ *		The relation is removed from the store.  Actually, we just remember
+ *		that we want to do this at transaction commit.
  */
 int
 smgrunlink(int16 which, Relation reln)
 {
-	int			status;
-
-	if ((status = (*(smgrsw[which].smgr_unlink)) (reln)) == SM_FAIL)
-		elog(ERROR, "cannot unlink %s: %m", RelationGetRelationName(reln));
+	PendingRelDelete *pending;
+
+	/* Make sure the file is closed */
+	if (reln->rd_fd >= 0)
+		smgrclose(which, reln);
+
+	/* Add the relation to the list of stuff to delete at commit */
+	pending = (PendingRelDelete *)
+		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+	pending->relnode = reln->rd_node;
+	pending->which = which;
+	pending->atCommit = true;	/* delete if commit */
+	pending->next = pendingDeletes;
+	pendingDeletes = pending;
+
+	/*
+	 * NOTE: if the relation was created in this transaction, it will now
+	 * be present in the pending-delete list twice, once with atCommit true
+	 * and once with atCommit false.  Hence, it will be physically deleted
+	 * at end of xact in either case (and the other entry will be ignored
+	 * by smgrDoPendingDeletes, so no error will occur).  We could instead
+	 * remove the existing list entry and delete the physical file
+	 * immediately, but for now I'll keep the logic simple.
+	 */
 
-	return status;
+	return SM_SUCCESS;
 }
 
 /*
@@ -193,17 +253,18 @@ smgrextend(int16 which, Relation reln, char *buffer)
 /*
  *	smgropen() -- Open a relation using a particular storage manager.
  *
- *		Returns the fd for the open relation on success, aborts the
- *		transaction on failure.
+ *		Returns the fd for the open relation on success.
+ *
+ *		On failure, returns -1 if failOK, else aborts the transaction.
  */
 int
-smgropen(int16 which, Relation reln)
+smgropen(int16 which, Relation reln, bool failOK)
 {
 	int			fd;
 
-	if ((fd = (*(smgrsw[which].smgr_open)) (reln)) < 0 &&
-		!reln->rd_unlinked)
-		elog(ERROR, "cannot open %s: %m", RelationGetRelationName(reln));
+	if ((fd = (*(smgrsw[which].smgr_open)) (reln)) < 0)
+		if (! failOK)
+			elog(ERROR, "cannot open %s: %m", RelationGetRelationName(reln));
 
 	return fd;
 }
@@ -211,12 +272,6 @@ smgropen(int16 which, Relation reln)
 /*
  *	smgrclose() -- Close a relation.
  *
- *		NOTE: underlying manager should allow case where relation is
- *		already closed.  Indeed relation may have been unlinked!
- *		This is currently called only from RelationFlushRelation() when
- *		the relation cache entry is about to be dropped; could be doing
- *		simple relation cache clear, or finishing up DROP TABLE.
- *
  *		Returns SM_SUCCESS on success, aborts on failure.
  */
 int
@@ -411,6 +466,41 @@ smgrtruncate(int16 which, Relation reln, int nblocks)
 	return newblks;
 }
 
+/*
+ * smgrDoPendingDeletes() -- take care of relation deletes at end of xact.
+ */
+int
+smgrDoPendingDeletes(bool isCommit)
+{
+	while (pendingDeletes != NULL)
+	{
+		PendingRelDelete *pending = pendingDeletes;
+
+		pendingDeletes = pending->next;
+		if (pending->atCommit == isCommit)
+		{
+			/*
+			 * Get rid of any leftover buffers for the rel (shouldn't be
+			 * any in the commit case, but there can be in the abort case).
+			 */
+			DropRelFileNodeBuffers(pending->relnode);
+			/*
+			 * And delete the physical files.
+			 *
+			 * Note: we treat deletion failure as a NOTICE, not an error,
+			 * because we've already decided to commit or abort the current
+			 * xact.
+			 */
+			if ((*(smgrsw[pending->which].smgr_unlink)) (pending->relnode) == SM_FAIL)
+				elog(NOTICE, "cannot unlink %u/%u: %m",
+					 pending->relnode.tblNode, pending->relnode.relNode);
+		}
+		pfree(pending);
+	}
+
+	return SM_SUCCESS;
+}
+
 /*
  *	smgrcommit(), smgrabort() -- Commit or abort changes made during the
  *								 current transaction.
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index e218daa748..8f4fd626f8 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/inval.c,v 1.37 2000/06/08 19:51:03 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/inval.c,v 1.38 2000/11/08 22:10:01 tgl Exp $
  *
  * Note - this code is real crufty...
  *
@@ -80,10 +80,10 @@ typedef InvalidationMessageData *InvalidationMessage;
 
 /*
  * ----------------
- *	Invalidation info was devided into three parts.
- *	1) shared invalidation to be registerd for all backends
+ *	Invalidation info is divided into three parts.
+ *	1) shared invalidation to be registered for all backends
  *	2) local invalidation for the transaction itself
- *	3) rollback information for the transaction itself
+ *	3) rollback information for the transaction itself (in case we abort)
  * ----------------
  */
 
@@ -160,7 +160,9 @@ LocalInvalidRegister(LocalInvalid invalid,
  * --------------------------------
  */
 static void
-			LocalInvalidInvalidate(LocalInvalid invalid, void (*function) (), bool freemember)
+LocalInvalidInvalidate(LocalInvalid invalid,
+					   void (*function) (),
+					   bool freemember)
 {
 	InvalidationEntryData *entryDataP;
 
@@ -216,15 +218,10 @@ elog(DEBUG, "CacheIdRegisterLocalInvalid(%d, %d, [%d, %d])", \
 elog(DEBUG, "CacheIdRegisterLocalRollback(%d, %d, [%d, %d])", \
 	 cacheId, hashIndex, ItemPointerGetBlockNumber(pointer), \
 	 ItemPointerGetOffsetNumber(pointer))
-#define CacheIdImmediateRegisterSharedInvalid_DEBUG1 \
-elog(DEBUG, "CacheIdImmediateRegisterSharedInvalid(%d, %d, [%d, %d])", \
-	 cacheId, hashIndex, ItemPointerGetBlockNumber(pointer), \
-	 ItemPointerGetOffsetNumber(pointer))
 #else
 #define CacheIdRegisterSpecifiedLocalInvalid_DEBUG1
 #define CacheIdRegisterLocalInvalid_DEBUG1
 #define CacheIdRegisterLocalRollback_DEBUG1
-#define CacheIdImmediateRegisterSharedInvalid_DEBUG1
 #endif	 /* INVALIDDEBUG */
 
 /* --------------------------------
@@ -233,7 +230,9 @@ elog(DEBUG, "CacheIdImmediateRegisterSharedInvalid(%d, %d, [%d, %d])", \
  */
 static LocalInvalid
 CacheIdRegisterSpecifiedLocalInvalid(LocalInvalid invalid,
-					 Index cacheId, Index hashIndex, ItemPointer pointer)
+									 Index cacheId,
+									 Index hashIndex,
+									 ItemPointer pointer)
 {
 	InvalidationMessage message;
 
@@ -317,43 +316,6 @@ CacheIdRegisterLocalRollback(Index cacheId, Index hashIndex,
 							 RollbackStack, cacheId, hashIndex, pointer);
 }
 
-/* --------------------------------
- *		CacheIdImmediateRegisterSharedInvalid
- * --------------------------------
- */
-static void
-CacheIdImmediateRegisterSharedInvalid(Index cacheId, Index hashIndex,
-									  ItemPointer pointer)
-{
-	InvalidationMessage message;
-
-	/* ----------------
-	 *	debugging stuff
-	 * ----------------
-	 */
-	CacheIdImmediateRegisterSharedInvalid_DEBUG1;
-
-	/* ----------------
-	 *	create a message describing the system catalog tuple
-	 *	we wish to invalidate.
-	 * ----------------
-	 */
-	message = (InvalidationMessage)
-		InvalidationEntryAllocate(sizeof(InvalidationMessageData));
-
-	message->kind = 'c';
-	message->any.catalog.cacheId = cacheId;
-	message->any.catalog.hashIndex = hashIndex;
-
-	ItemPointerCopy(pointer, &message->any.catalog.pointerData);
-	/* ----------------
-	 *	Register a shared catalog cache invalidation.
-	 * ----------------
-	 */
-	InvalidationMessageRegisterSharedInvalid(message);
-	free((Pointer) &((InvalidationUserData *) message)->dataP[-1]);
-}
-
 /* --------------------------------
  *		RelationIdRegisterSpecifiedLocalInvalid
  * --------------------------------
@@ -448,44 +410,6 @@ RelationIdRegisterLocalRollback(Oid relationId, Oid objectId)
 									RollbackStack, relationId, objectId);
 }
 
-/* --------------------------------
- *		RelationIdImmediateRegisterSharedInvalid
- * --------------------------------
- */
-static void
-RelationIdImmediateRegisterSharedInvalid(Oid relationId, Oid objectId)
-{
-	InvalidationMessage message;
-
-	/* ----------------
-	 *	debugging stuff
-	 * ----------------
-	 */
-#ifdef	INVALIDDEBUG
-	elog(DEBUG, "RelationImmediateRegisterSharedInvalid(%u, %u)", relationId,
-		 objectId);
-#endif	 /* defined(INVALIDDEBUG) */
-
-	/* ----------------
-	 *	create a message describing the relation descriptor
-	 *	we wish to invalidate.
-	 * ----------------
-	 */
-	message = (InvalidationMessage)
-		InvalidationEntryAllocate(sizeof(InvalidationMessageData));
-
-	message->kind = 'r';
-	message->any.relation.relationId = relationId;
-	message->any.relation.objectId = objectId;
-
-	/* ----------------
-	 *	Register a shared catalog cache invalidation.
-	 * ----------------
-	 */
-	InvalidationMessageRegisterSharedInvalid(message);
-	free((Pointer) &((InvalidationUserData *) message)->dataP[-1]);
-}
-
 /* --------------------------------
  *		CacheIdInvalidate
  *
@@ -890,55 +814,3 @@ RelationMark4RollbackHeapTuple(Relation relation, HeapTuple tuple)
 								RelationIdRegisterLocalRollback,
 								"RelationMark4RollbackHeapTuple");
 }
-
-/*
- * ImmediateInvalidateSharedHeapTuple
- *		Different from RelationInvalidateHeapTuple()
- *		this function queues shared invalidation info immediately.
- */
-void
-ImmediateInvalidateSharedHeapTuple(Relation relation, HeapTuple tuple)
-{
-	InvokeHeapTupleInvalidation(relation, tuple,
-								CacheIdImmediateRegisterSharedInvalid,
-								RelationIdImmediateRegisterSharedInvalid,
-								"ImmediateInvalidateSharedHeapTuple");
-}
-
-#ifdef NOT_USED
-/*
- * ImmediateSharedRelationCacheInvalidate
- *	Register shared relation cache invalidation immediately
- *
- *	This is needed for smgrunlink()/smgrtruncate().
- *	Those functions unlink/truncate the base file immediately
- *	and couldn't be rollbacked in case of abort/crash.
- *	So relation cache invalidation must be registerd immediately.
- *	Note:
- *		Assumes Relation is valid.
- */
-void
-ImmediateSharedRelationCacheInvalidate(Relation relation)
-{
-	/* ----------------
-	 *	sanity checks
-	 * ----------------
-	 */
-	Assert(RelationIsValid(relation));
-
-	if (IsBootstrapProcessingMode())
-		return;
-
-	/* ----------------
-	 *	debugging stuff
-	 * ----------------
-	 */
-#ifdef	INVALIDDEBUG
-	elog(DEBUG, "ImmediateSharedRelationCacheInvalidate(%s)", \
-		 RelationGetPhysicalRelationName(relation));
-#endif	 /* defined(INVALIDDEBUG) */
-
-	RelationIdImmediateRegisterSharedInvalid(
-							RelOid_pg_class, RelationGetRelid(relation));
-}
-#endif
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index ea7a8d0212..be902d7842 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.114 2000/10/28 16:20:57 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.115 2000/11/08 22:10:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -954,7 +954,6 @@ static Relation
 RelationBuildDesc(RelationBuildDescInfo buildinfo,
 				  Relation oldrelation)
 {
-	File		fd;
 	Relation	relation;
 	Oid			relid;
 	Oid			relam;
@@ -1069,18 +1068,10 @@ RelationBuildDesc(RelationBuildDescInfo buildinfo,
 	 *	by the storage manager code to rd_fd.
 	 * ----------------
 	 */
-	if (relation->rd_rel->relkind != RELKIND_VIEW) {
-		fd = smgropen(DEFAULT_SMGR, relation);
-
-		Assert(fd >= -1);
-		if (fd == -1)
-			elog(NOTICE, "RelationBuildDesc: smgropen(%s): %m",
-				 NameStr(relation->rd_rel->relname));
-
-		relation->rd_fd = fd;
-	} else {
+	if (relation->rd_rel->relkind != RELKIND_VIEW)
+		relation->rd_fd = smgropen(DEFAULT_SMGR, relation, false);
+	else
 		relation->rd_fd = -1;
-	}
 
 	/* ----------------
 	 *	insert newly created relation into proper relcaches,
@@ -1337,14 +1328,11 @@ RelationIdCacheGetRelation(Oid relationId)
 
 	if (RelationIsValid(rd))
 	{
+		/* re-open files if necessary */
 		if (rd->rd_fd == -1 && rd->rd_rel->relkind != RELKIND_VIEW)
-		{
-			rd->rd_fd = smgropen(DEFAULT_SMGR, rd);
-			Assert(rd->rd_fd != -1 || rd->rd_unlinked);
-		}
+			rd->rd_fd = smgropen(DEFAULT_SMGR, rd, false);
 
 		RelationIncrementReferenceCount(rd);
-
 	}
 
 	return rd;
@@ -1371,14 +1359,11 @@ RelationNameCacheGetRelation(const char *relationName)
 
 	if (RelationIsValid(rd))
 	{
+		/* re-open files if necessary */
 		if (rd->rd_fd == -1 && rd->rd_rel->relkind != RELKIND_VIEW)
-		{
-			rd->rd_fd = smgropen(DEFAULT_SMGR, rd);
-			Assert(rd->rd_fd != -1 || rd->rd_unlinked);
-		}
+			rd->rd_fd = smgropen(DEFAULT_SMGR, rd, false);
 
 		RelationIncrementReferenceCount(rd);
-
 	}
 
 	return rd;
@@ -1393,14 +1378,11 @@ RelationNodeCacheGetRelation(RelFileNode rnode)
 
 	if (RelationIsValid(rd))
 	{
+		/* re-open files if necessary */
 		if (rd->rd_fd == -1 && rd->rd_rel->relkind != RELKIND_VIEW)
-		{
-			rd->rd_fd = smgropen(DEFAULT_SMGR, rd);
-			Assert(rd->rd_fd != -1 || rd->rd_unlinked);
-		}
+			rd->rd_fd = smgropen(DEFAULT_SMGR, rd, false);
 
 		RelationIncrementReferenceCount(rd);
-
 	}
 
 	return rd;
@@ -1536,15 +1518,13 @@ RelationClearRelation(Relation relation, bool rebuildIt)
 
 	/*
 	 * Make sure smgr and lower levels close the relation's files, if they
-	 * weren't closed already.  We do this unconditionally; if the
-	 * relation is not deleted, the next smgr access should reopen the
-	 * files automatically.  This ensures that the low-level file access
-	 * state is updated after, say, a vacuum truncation.
-	 *
-	 * NOTE: this call is a no-op if the relation's smgr file is already
-	 * closed or unlinked.
+	 * weren't closed already.  If the relation is not getting deleted,
+	 * the next smgr access should reopen the files automatically.  This
+	 * ensures that the low-level file access state is updated after, say,
+	 * a vacuum truncation.
 	 */
-	smgrclose(DEFAULT_SMGR, relation);
+	if (relation->rd_fd >= 0)
+		smgrclose(DEFAULT_SMGR, relation);
 
 	/*
 	 * Never, never ever blow away a nailed-in system relation, because
@@ -1617,7 +1597,6 @@ RelationClearRelation(Relation relation, bool rebuildIt)
 		MemoryContext old_rulescxt = relation->rd_rulescxt;
 		TriggerDesc *old_trigdesc = relation->trigdesc;
 		int			old_nblocks = relation->rd_nblocks;
-		bool		relDescChanged = false;
 		RelationBuildDescInfo buildinfo;
 
 		buildinfo.infotype = INFO_RELID;
@@ -1644,7 +1623,6 @@ RelationClearRelation(Relation relation, bool rebuildIt)
 		else
 		{
 			FreeTupleDesc(old_att);
-			relDescChanged = true;
 		}
 		if (equalRuleLocks(old_rules, relation->rd_rules))
 		{
@@ -1657,7 +1635,6 @@ RelationClearRelation(Relation relation, bool rebuildIt)
 		{
 			if (old_rulescxt)
 				MemoryContextDelete(old_rulescxt);
-			relDescChanged = true;
 		}
 		if (equalTriggerDescs(old_trigdesc, relation->trigdesc))
 		{
@@ -1667,7 +1644,6 @@ RelationClearRelation(Relation relation, bool rebuildIt)
 		else
 		{
 			FreeTriggerDesc(old_trigdesc);
-			relDescChanged = true;
 		}
 		relation->rd_nblocks = old_nblocks;
 
@@ -1675,14 +1651,7 @@ RelationClearRelation(Relation relation, bool rebuildIt)
 		 * this is kind of expensive, but I think we must do it in case
 		 * relation has been truncated...
 		 */
-		if (relation->rd_unlinked)
-			relation->rd_nblocks = 0;
-		else
-			relation->rd_nblocks = RelationGetNumberOfBlocks(relation);
-
-		if (relDescChanged && !RelationHasReferenceCountZero(relation))
-			elog(ERROR, "RelationClearRelation: relation %u modified while in use",
-				 buildinfo.i.info_id);
+		relation->rd_nblocks = RelationGetNumberOfBlocks(relation);
 	}
 }
 
@@ -1934,9 +1903,6 @@ RelationRegisterRelation(Relation relation)
 void
 RelationPurgeLocalRelation(bool xactCommitted)
 {
-	if (newlyCreatedRelns == NULL)
-		return;
-
 	while (newlyCreatedRelns)
 	{
 		List	   *l = newlyCreatedRelns;
@@ -1949,19 +1915,7 @@ RelationPurgeLocalRelation(bool xactCommitted)
 		newlyCreatedRelns = lnext(newlyCreatedRelns);
 		pfree(l);
 
-		if (!xactCommitted)
-		{
-			/*
-			 * remove the file if we abort. This is so that files for
-			 * tables created inside a transaction block get removed.
-			 */
-			if (! reln->rd_unlinked)
-			{
-				smgrunlink(DEFAULT_SMGR, reln);
-				reln->rd_unlinked = true;
-			}
-		}
-
+		/* XXX is this step still needed?  If so, why? */
 		if (!IsBootstrapProcessingMode())
 			RelationClearRelation(reln, false);
 	}
diff --git a/src/backend/utils/cache/temprel.c b/src/backend/utils/cache/temprel.c
index 460cf56a40..31591663ce 100644
--- a/src/backend/utils/cache/temprel.c
+++ b/src/backend/utils/cache/temprel.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/Attic/temprel.c,v 1.29 2000/10/19 23:06:24 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/Attic/temprel.c,v 1.30 2000/11/08 22:10:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -27,10 +27,10 @@
  * to drop the underlying physical relations at session shutdown.
  */
 
-#include <sys/types.h>
-
 #include "postgres.h"
 
+#include <sys/types.h>
+
 #include "catalog/heap.h"
 #include "catalog/index.h"
 #include "miscadmin.h"
@@ -47,11 +47,19 @@ static List *temp_rels = NIL;
 
 typedef struct TempTable
 {
-	char	   *user_relname;	/* logical name of temp table */
-	char	   *relname;		/* underlying unique name */
+	NameData	user_relname;	/* logical name of temp table */
+	NameData	relname;		/* underlying unique name */
 	Oid			relid;			/* needed properties of rel */
 	char		relkind;
-	TransactionId xid;			/* xact in which temp tab was created */
+	/*
+	 * If this entry was created during this xact, it should be deleted
+	 * at xact abort.  Conversely, if this entry was deleted during this
+	 * xact, it should be removed at xact commit.  We leave deleted entries
+	 * in the list until commit so that we can roll back if needed ---
+	 * but we ignore them for purposes of lookup!
+	 */
+	bool		created_in_cur_xact;
+	bool		deleted_in_cur_xact;
 } TempTable;
 
 
@@ -71,26 +79,122 @@ create_temp_relation(const char *relname, HeapTuple pg_class_tuple)
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 
 	temp_rel = (TempTable *) palloc(sizeof(TempTable));
-	temp_rel->user_relname = (char *) palloc(NAMEDATALEN);
-	temp_rel->relname = (char *) palloc(NAMEDATALEN);
 
-	StrNCpy(temp_rel->user_relname, relname, NAMEDATALEN);
-	StrNCpy(temp_rel->relname, NameStr(pg_class_form->relname), NAMEDATALEN);
+	StrNCpy(NameStr(temp_rel->user_relname), relname,
+			NAMEDATALEN);
+	StrNCpy(NameStr(temp_rel->relname), NameStr(pg_class_form->relname),
+			NAMEDATALEN);
 	temp_rel->relid = pg_class_tuple->t_data->t_oid;
 	temp_rel->relkind = pg_class_form->relkind;
-	temp_rel->xid = GetCurrentTransactionId();
+	temp_rel->created_in_cur_xact = true;
+	temp_rel->deleted_in_cur_xact = false;
 
 	temp_rels = lcons(temp_rel, temp_rels);
 
 	MemoryContextSwitchTo(oldcxt);
 }
 
+/*
+ * Remove a temp relation map entry (part of DROP TABLE on a temp table).
+ * We don't actually remove the entry, just mark it dead.
+ *
+ * We don't have the relname for indexes, so we just pass the oid.
+ */
+void
+remove_temp_rel_by_relid(Oid relid)
+{
+	List	   *l;
+
+	foreach(l, temp_rels)
+	{
+		TempTable  *temp_rel = (TempTable *) lfirst(l);
+
+		if (temp_rel->relid == relid)
+			temp_rel->deleted_in_cur_xact = true;
+		/* Keep scanning 'cause there could be multiple matches; see RENAME */
+	}
+}
+
+/*
+ * To implement ALTER TABLE RENAME on a temp table, we shouldn't touch
+ * the underlying physical table at all, just change the map entry!
+ *
+ * This routine is invoked early in ALTER TABLE RENAME to check for
+ * the temp-table case.  If oldname matches a temp table name, change
+ * the mapping to the new logical name and return TRUE (or elog if
+ * there is a conflict with another temp table name).  If there is
+ * no match, return FALSE indicating that normal rename should proceed.
+ *
+ * We also reject an attempt to rename a normal table to a name in use
+ * as a temp table name.  That would fail later on anyway when rename.c
+ * looks for a rename conflict, but we can give a more specific error
+ * message for the problem here.
+ *
+ * It might seem that we need to check for attempts to rename the physical
+ * file underlying a temp table, but that'll be rejected anyway because
+ * pg_tempXXX looks like a system table name.
+ */
+bool
+rename_temp_relation(const char *oldname,
+					 const char *newname)
+{
+	List	   *l;
+
+	foreach(l, temp_rels)
+	{
+		TempTable  *temp_rel = (TempTable *) lfirst(l);
+		MemoryContext oldcxt;
+		TempTable  *new_temp_rel;
+
+		if (temp_rel->deleted_in_cur_xact)
+			continue;			/* ignore it if logically deleted */
+
+		if (strcmp(NameStr(temp_rel->user_relname), oldname) != 0)
+			continue;			/* ignore non-matching entries */
+
+		/* We are renaming a temp table --- is it OK to do so? */
+		if (get_temp_rel_by_username(newname) != NULL)
+			elog(ERROR, "Cannot rename temp table \"%s\": temp table \"%s\" already exists",
+				 oldname, newname);
+
+		/*
+		 * Create a new mapping entry and mark the old one deleted in this
+		 * xact.  One of these entries will be deleted at xact end.
+		 */
+		oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+
+		new_temp_rel = (TempTable *) palloc(sizeof(TempTable));
+		memcpy(new_temp_rel, temp_rel, sizeof(TempTable));
+
+		StrNCpy(NameStr(new_temp_rel->user_relname), newname, NAMEDATALEN);
+		new_temp_rel->created_in_cur_xact = true;
+
+		temp_rels = lcons(new_temp_rel, temp_rels);
+
+		temp_rel->deleted_in_cur_xact = true;
+
+		MemoryContextSwitchTo(oldcxt);
+
+		return true;
+	}
+
+	/* Old name does not match any temp table name, what about new? */
+	if (get_temp_rel_by_username(newname) != NULL)
+		elog(ERROR, "Cannot rename \"%s\" to \"%s\": a temp table by that name already exists",
+			 oldname, newname);
+
+	return false;
+}
+
+
 /*
  * Remove underlying relations for all temp rels at backend shutdown.
  */
 void
 remove_all_temp_relations(void)
 {
+	List	   *l;
+
 	/* skip xact start overhead if nothing to do */
 	if (temp_rels == NIL)
 		return;
@@ -99,21 +203,24 @@ remove_all_temp_relations(void)
 	StartTransactionCommand();
 
 	/*
-	 * The way this works is that each time through the loop, we delete
-	 * the frontmost entry.  The DROP will call remove_temp_rel_by_relid()
-	 * as a side effect, thereby removing the entry in the temp_rels list.
-	 * So this is not an infinite loop, even though it looks like one.
+	 * Scan the list and delete all entries not already deleted.
+	 * We need not worry about list entries getting deleted from under us,
+	 * because remove_temp_rel_by_relid() doesn't remove entries, only
+	 * marks them dead.
 	 */
-	while (temp_rels != NIL)
+	foreach(l, temp_rels)
 	{
-		TempTable  *temp_rel = (TempTable *) lfirst(temp_rels);
+		TempTable  *temp_rel = (TempTable *) lfirst(l);
+
+		if (temp_rel->deleted_in_cur_xact)
+			continue;			/* ignore it if deleted already */
 
 		if (temp_rel->relkind != RELKIND_INDEX)
 		{
 			char		relname[NAMEDATALEN];
 
 			/* safe from deallocation */
-			strcpy(relname, temp_rel->user_relname);
+			strcpy(relname, NameStr(temp_rel->user_relname));
 			heap_drop_with_catalog(relname, allowSystemTableMods);
 		}
 		else
@@ -126,79 +233,30 @@ remove_all_temp_relations(void)
 }
 
 /*
- * Remove a temp relation map entry (part of DROP TABLE on a temp table)
+ * Clean up temprel mapping entries during transaction commit or abort.
  *
- * we don't have the relname for indexes, so we just pass the oid
- */
-void
-remove_temp_rel_by_relid(Oid relid)
-{
-	MemoryContext oldcxt;
-	List	   *l,
-			   *prev;
-
-	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
-
-	prev = NIL;
-	l = temp_rels;
-	while (l != NIL)
-	{
-		TempTable  *temp_rel = (TempTable *) lfirst(l);
-
-		if (temp_rel->relid == relid)
-		{
-			pfree(temp_rel->user_relname);
-			pfree(temp_rel->relname);
-			pfree(temp_rel);
-			/* remove from linked list */
-			if (prev != NIL)
-			{
-				lnext(prev) = lnext(l);
-				pfree(l);
-				l = lnext(prev);
-			}
-			else
-			{
-				temp_rels = lnext(l);
-				pfree(l);
-				l = temp_rels;
-			}
-		}
-		else
-		{
-			prev = l;
-			l = lnext(l);
-		}
-	}
-
-	MemoryContextSwitchTo(oldcxt);
-}
-
-/*
- * Remove freshly-created map entries during transaction abort.
+ * During commit, remove entries that were deleted during this transaction;
+ * during abort, remove those created during this transaction.
  *
- * The underlying physical rel will be removed by normal abort processing.
- * We just have to delete the map entry.
+ * We do not need to worry about removing the underlying physical relation;
+ * that's someone else's job.
  */
 void
-remove_temp_rel_in_myxid(void)
+AtEOXact_temp_relations(bool isCommit)
 {
-	MemoryContext oldcxt;
 	List	   *l,
 			   *prev;
 
-	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
-
 	prev = NIL;
 	l = temp_rels;
 	while (l != NIL)
 	{
 		TempTable  *temp_rel = (TempTable *) lfirst(l);
 
-		if (temp_rel->xid == GetCurrentTransactionId())
+		if (isCommit ? temp_rel->deleted_in_cur_xact :
+			temp_rel->created_in_cur_xact)
 		{
-			pfree(temp_rel->user_relname);
-			pfree(temp_rel->relname);
+			/* This entry must be removed */
 			pfree(temp_rel);
 			/* remove from linked list */
 			if (prev != NIL)
@@ -216,65 +274,13 @@ remove_temp_rel_in_myxid(void)
 		}
 		else
 		{
+			/* This entry must be preserved */
+			temp_rel->created_in_cur_xact = false;
+			temp_rel->deleted_in_cur_xact = false;
 			prev = l;
 			l = lnext(l);
 		}
 	}
-
-	MemoryContextSwitchTo(oldcxt);
-}
-
-/*
- * To implement ALTER TABLE RENAME on a temp table, we shouldn't touch
- * the underlying physical table at all, just change the map entry!
- *
- * This routine is invoked early in ALTER TABLE RENAME to check for
- * the temp-table case.  If oldname matches a temp table name, change
- * the map entry to the new logical name and return TRUE (or elog if
- * there is a conflict with another temp table name).  If there is
- * no match, return FALSE indicating that normal rename should proceed.
- *
- * We also reject an attempt to rename a normal table to a name in use
- * as a temp table name.  That would fail later on anyway when rename.c
- * looks for a rename conflict, but we can give a more specific error
- * message for the problem here.
- *
- * It might seem that we need to check for attempts to rename the physical
- * file underlying a temp table, but that'll be rejected anyway because
- * pg_tempXXX looks like a system table name.
- *
- * A nitpicker might complain that the rename should be undone if the
- * current xact is later aborted, but I'm not going to fix that now.
- * This whole mapping mechanism ought to be replaced with something
- * schema-based, anyhow.
- */
-bool
-rename_temp_relation(const char *oldname,
-					 const char *newname)
-{
-	List	   *l;
-
-	foreach(l, temp_rels)
-	{
-		TempTable  *temp_rel = (TempTable *) lfirst(l);
-
-		if (strcmp(temp_rel->user_relname, oldname) == 0)
-		{
-			if (get_temp_rel_by_username(newname) != NULL)
-				elog(ERROR, "Cannot rename temp table \"%s\": temp table \"%s\" already exists",
-					 oldname, newname);
-			/* user_relname was palloc'd NAMEDATALEN, so safe to re-use it */
-			StrNCpy(temp_rel->user_relname, newname, NAMEDATALEN);
-			return true;
-		}
-	}
-
-	/* Old name does not match any temp table name, what about new? */
-	if (get_temp_rel_by_username(newname) != NULL)
-		elog(ERROR, "Cannot rename \"%s\" to \"%s\": a temp table by that name already exists",
-			 oldname, newname);
-
-	return false;
 }
 
 
@@ -292,8 +298,11 @@ get_temp_rel_by_username(const char *user_relname)
 	{
 		TempTable  *temp_rel = (TempTable *) lfirst(l);
 
-		if (strcmp(temp_rel->user_relname, user_relname) == 0)
-			return temp_rel->relname;
+		if (temp_rel->deleted_in_cur_xact)
+			continue;			/* ignore it if logically deleted */
+
+		if (strcmp(NameStr(temp_rel->user_relname), user_relname) == 0)
+			return NameStr(temp_rel->relname);
 	}
 	return NULL;
 }
@@ -310,8 +319,11 @@ get_temp_rel_by_physicalname(const char *relname)
 	{
 		TempTable  *temp_rel = (TempTable *) lfirst(l);
 
-		if (strcmp(temp_rel->relname, relname) == 0)
-			return temp_rel->user_relname;
+		if (temp_rel->deleted_in_cur_xact)
+			continue;			/* ignore it if logically deleted */
+
+		if (strcmp(NameStr(temp_rel->relname), relname) == 0)
+			return NameStr(temp_rel->user_relname);
 	}
 	/* needed for bootstrapping temp tables */
 	return pstrdup(relname);
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h
index 409e103f4d..4cc317492f 100644
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: heap.h,v 1.31 2000/07/04 06:11:54 tgl Exp $
+ * $Id: heap.h,v 1.32 2000/11/08 22:10:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -29,7 +29,7 @@ extern Relation heap_create(char *relname, TupleDesc tupDesc,
 							bool istemp, bool storage_create,
 							bool allow_system_table_mods);
 
-extern bool heap_storage_create(Relation rel);
+extern void heap_storage_create(Relation rel);
 
 extern Oid heap_create_with_catalog(char *relname, TupleDesc tupdesc,
 									char relkind, bool istemp,
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index e00b25e6f0..967bffb4aa 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: index.h,v 1.29 2000/07/14 22:17:56 tgl Exp $
+ * $Id: index.h,v 1.30 2000/11/08 22:10:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,9 +46,9 @@ extern void FormIndexDatum(IndexInfo *indexInfo,
 						   Datum *datum,
 						   char *nullv);
 
-extern void UpdateStats(Oid relid, long reltuples, bool inplace);
+extern void UpdateStats(Oid relid, long reltuples);
 extern bool IndexesAreActive(Oid relid, bool comfirmCommitted);
-extern void setRelhasindexInplace(Oid relid, bool hasindex, bool immediate);
+extern void setRelhasindex(Oid relid, bool hasindex);
 extern bool SetReindexProcessing(bool processing);
 extern bool IsReindexProcessing(void);
 
diff --git a/src/include/parser/parse_clause.h b/src/include/parser/parse_clause.h
index fd1cfdb360..421156ac21 100644
--- a/src/include/parser/parse_clause.h
+++ b/src/include/parser/parse_clause.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: parse_clause.h,v 1.19 2000/09/12 21:07:12 tgl Exp $
+ * $Id: parse_clause.h,v 1.20 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -17,6 +17,7 @@
 #include "parser/parse_node.h"
 
 extern void makeRangeTable(ParseState *pstate, List *frmList);
+extern void lockTargetTable(ParseState *pstate, char *relname);
 extern void setTargetTable(ParseState *pstate, char *relname,
 						   bool inh, bool inJoinSet);
 extern Node *transformWhereClause(ParseState *pstate, Node *where);
diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h
index 22dd797c7f..a0a41c3289 100644
--- a/src/include/parser/parse_node.h
+++ b/src/include/parser/parse_node.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: parse_node.h,v 1.22 2000/09/29 18:21:40 tgl Exp $
+ * $Id: parse_node.h,v 1.23 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,6 +26,7 @@ typedef struct ParseState
 	List	   *p_joinlist;		/* join items so far (will become
 								 * FromExpr node's fromlist) */
 	int			p_last_resno;	/* last targetlist resno assigned */
+	List	   *p_forUpdate;	/* FOR UPDATE clause, if any (see gram.y) */
 	bool		p_hasAggs;
 	bool		p_hasSubLinks;
 	bool		p_is_insert;
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 80aca7c57e..fc15e59859 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: buf_internals.h,v 1.42 2000/10/28 16:21:00 vadim Exp $
+ * $Id: buf_internals.h,v 1.43 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -200,7 +200,7 @@ extern int	NLocBuffer;
 extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,
 				 bool *foundPtr);
 extern int	WriteLocalBuffer(Buffer buffer, bool release);
-extern int	FlushLocalBuffer(Buffer buffer, bool release);
+extern int	FlushLocalBuffer(Buffer buffer, bool sync, bool release);
 extern void InitLocalBuffer(void);
 extern void LocalBufferSync(void);
 extern void ResetLocalBufferPool(void);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 0ed4837305..22c0ccde7d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -7,15 +7,16 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: bufmgr.h,v 1.42 2000/10/28 16:21:00 vadim Exp $
+ * $Id: bufmgr.h,v 1.43 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef BUFMGR_H
 #define BUFMGR_H
 
-#include "storage/buf_internals.h"
 #include "access/xlogdefs.h"
+#include "storage/buf_internals.h"
+#include "storage/relfilenode.h"
 
 typedef void *Block;
 
@@ -151,7 +152,7 @@ extern int	WriteBuffer(Buffer buffer);
 extern int	WriteNoReleaseBuffer(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 					 BlockNumber blockNum);
-extern int	FlushBuffer(Buffer buffer, bool release);
+extern int	FlushBuffer(Buffer buffer, bool sync, bool release);
 
 extern void InitBufferPool(IPCKey key);
 extern void PrintBufferUsage(FILE *statfp);
@@ -162,7 +163,8 @@ extern void FlushBufferPool(void);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
 extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
 extern int	FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
-extern void ReleaseRelationBuffers(Relation rel);
+extern void DropRelationBuffers(Relation rel);
+extern void DropRelFileNodeBuffers(RelFileNode rnode);
 extern void DropBuffers(Oid dbid);
 extern void PrintPinnedBufs(void);
 extern int	BufferShmemSize(void);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 49a2e3e5e9..99eed75fe6 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: smgr.h,v 1.23 2000/10/28 16:21:00 vadim Exp $
+ * $Id: smgr.h,v 1.24 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,7 +28,7 @@ extern int	smgrinit(void);
 extern int	smgrcreate(int16 which, Relation reln);
 extern int	smgrunlink(int16 which, Relation reln);
 extern int	smgrextend(int16 which, Relation reln, char *buffer);
-extern int	smgropen(int16 which, Relation reln);
+extern int	smgropen(int16 which, Relation reln, bool failOK);
 extern int	smgrclose(int16 which, Relation reln);
 extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
 		 char *buffer);
@@ -43,6 +43,7 @@ extern int smgrblindmarkdirty(int16 which, RelFileNode rnode,
 extern int	smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno);
 extern int	smgrnblocks(int16 which, Relation reln);
 extern int	smgrtruncate(int16 which, Relation reln, int nblocks);
+extern int	smgrDoPendingDeletes(bool isCommit);
 extern int	smgrcommit(void);
 extern int	smgrabort(void);
 
@@ -56,7 +57,7 @@ extern int	smgrsync(void);
 /* in md.c */
 extern int	mdinit(void);
 extern int	mdcreate(Relation reln);
-extern int	mdunlink(Relation reln);
+extern int	mdunlink(RelFileNode rnode);
 extern int	mdextend(Relation reln, char *buffer);
 extern int	mdopen(Relation reln);
 extern int	mdclose(Relation reln);
@@ -64,9 +65,9 @@ extern int	mdread(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mdflush(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mdmarkdirty(Relation reln, BlockNumber blkno);
-extern int mdblindwrt(RelFileNode rnode, BlockNumber blkno,
-						char *buffer, bool dofsync);
-extern int mdblindmarkdirty(RelFileNode rnode, BlockNumber blkno);
+extern int	mdblindwrt(RelFileNode rnode, BlockNumber blkno,
+					   char *buffer, bool dofsync);
+extern int	mdblindmarkdirty(RelFileNode rnode, BlockNumber blkno);
 extern int	mdnblocks(Relation reln);
 extern int	mdtruncate(Relation reln, int nblocks);
 extern int	mdcommit(void);
@@ -81,7 +82,7 @@ extern SPINLOCK MMCacheLock;
 
 extern int	mminit(void);
 extern int	mmcreate(Relation reln);
-extern int	mmunlink(Relation reln);
+extern int	mmunlink(RelFileNode rnode);
 extern int	mmextend(Relation reln, char *buffer);
 extern int	mmopen(Relation reln);
 extern int	mmclose(Relation reln);
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index a585152555..b2ccee3adf 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: inval.h,v 1.17 2000/06/08 19:51:06 momjian Exp $
+ * $Id: inval.h,v 1.18 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,6 +26,4 @@ extern void RelationInvalidateHeapTuple(Relation relation, HeapTuple tuple);
 
 extern void RelationMark4RollbackHeapTuple(Relation relation, HeapTuple tuple);
 
-extern void ImmediateInvalidateSharedHeapTuple(Relation relation, HeapTuple tuple);
-
 #endif	 /* INVAL_H */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 4deec0618a..fd4012b0dd 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: rel.h,v 1.41 2000/09/07 09:58:38 vadim Exp $
+ * $Id: rel.h,v 1.42 2000/11/08 22:10:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -92,7 +92,6 @@ typedef struct RelationData
 	uint16		rd_refcnt;		/* reference count */
 	bool		rd_myxactonly;	/* rel uses the local buffer mgr */
 	bool		rd_isnailed;	/* rel is nailed in cache */
-	bool		rd_unlinked;	/* rel already unlinked or not created yet */
 	bool		rd_indexfound;	/* true if rd_indexlist is valid */
 	bool		rd_uniqueindex;	/* true if rel is a UNIQUE index */
 	Form_pg_am	rd_am;			/* AM tuple */
diff --git a/src/include/utils/temprel.h b/src/include/utils/temprel.h
index a99839ac26..789d505878 100644
--- a/src/include/utils/temprel.h
+++ b/src/include/utils/temprel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: temprel.h,v 1.11 2000/10/11 21:28:19 momjian Exp $
+ * $Id: temprel.h,v 1.12 2000/11/08 22:10:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,7 +23,7 @@ extern bool rename_temp_relation(const char *oldname,
 								 const char *newname);
 
 extern void remove_all_temp_relations(void);
-extern void remove_temp_rel_in_myxid(void);
+extern void AtEOXact_temp_relations(bool isCommit);
 
 extern char *get_temp_rel_by_username(const char *user_relname);
 extern char *get_temp_rel_by_physicalname(const char *relname);
diff --git a/src/test/regress/expected/errors.out b/src/test/regress/expected/errors.out
index c59ba0817c..b3396b5ec6 100644
--- a/src/test/regress/expected/errors.out
+++ b/src/test/regress/expected/errors.out
@@ -62,10 +62,10 @@ alter table rename;
 ERROR:  parser: parse error at or near ";"
 -- no such relation 
 alter table nonesuch rename to newnonesuch;
-ERROR:  Relation 'nonesuch' does not exist
+ERROR:  Relation "nonesuch" does not exist
 -- no such relation 
 alter table nonesuch rename to stud_emp;
-ERROR:  Relation 'nonesuch' does not exist
+ERROR:  Relation "nonesuch" does not exist
 -- system relation 
 alter table stud_emp rename to pg_stud_emp;
 ERROR:  renamerel: Illegal class name: "pg_stud_emp" -- pg_ is reserved for system catalogs