thereby saving a visit to the metapage in most index searches/updates.
This wouldn't actually save any I/O (since in the old regime the metapage
generally stayed in cache anyway), but it does provide a useful decrease
in bufmgr traffic in high-contention scenarios. Per my recent proposal.
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
Page zero of every btree is a meta-data page. This page stores the
location of the root page --- both the true root and the current effective
-root ("fast" root).
+root ("fast" root). To avoid fetching the metapage for every single index
+search, we cache a copy of the meta-data information in the index's
+relcache entry (rd_amcache). This is a bit ticklish since using the cache
+implies following a root page pointer that could be stale. We require
+every metapage update to send out an SI "relcache inval" message on the
+index relation. That ensures that each backend will flush its cached copy
+not later than the start of its next transaction. Therefore, stale
+pointers cannot be used for longer than the current transaction, which
+reduces the problem to the same one already dealt with for concurrent
+VACUUM --- we can just imagine that each open transaction is potentially
+"already in flight" to the old root.
The algorithm assumes we can fit at least three items per page
(a "high key" and two real data items). Therefore it's unsafe
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.135 2006/04/13 03:53:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.136 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
+#include "utils/inval.h"
typedef struct
END_CRIT_SECTION();
- /* release pin/lock */
+ /* release buffers; send out relcache inval if metapage changed */
if (BufferIsValid(metabuf))
+ {
+ CacheInvalidateRelcache(rel);
_bt_relbuf(rel, metabuf);
+ }
_bt_relbuf(rel, buf);
}
END_CRIT_SECTION();
+ /* send out relcache inval for metapage change */
+ CacheInvalidateRelcache(rel);
+
/* done with metapage */
_bt_relbuf(rel, metabuf);
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.95 2006/04/01 03:03:36 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.96 2006/04/25 22:46:05 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+#include "utils/inval.h"
/*
uint32 rootlevel;
BTMetaPageData *metad;
+ /*
+ * Try to use previously-cached metapage data to find the root. This
+ * normally saves one buffer access per index search, which is a very
+ * helpful savings in bufmgr traffic and hence contention.
+ */
+ if (rel->rd_amcache != NULL)
+ {
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version == BTREE_VERSION);
+ Assert(metad->btm_root != P_NONE);
+
+ rootblkno = metad->btm_fastroot;
+ Assert(rootblkno != P_NONE);
+ rootlevel = metad->btm_fastlevel;
+
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ /*
+ * Since the cache might be stale, we check the page more carefully
+ * here than normal. We *must* check that it's not deleted.
+ * If it's not alone on its level, then we reject too --- this
+ * may be overly paranoid but better safe than sorry. Note we
+ * don't check P_ISROOT, because that's not set in a "fast root".
+ */
+ if (!P_IGNORE(rootopaque) &&
+ rootopaque->btpo.level == rootlevel &&
+ P_LEFTMOST(rootopaque) &&
+ P_RIGHTMOST(rootopaque))
+ {
+ /* OK, accept cached page as the root */
+ return rootbuf;
+ }
+ _bt_relbuf(rel, rootbuf);
+ /* Cache is stale, throw it away */
+ if (rel->rd_amcache)
+ pfree(rel->rd_amcache);
+ rel->rd_amcache = NULL;
+ }
+
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
END_CRIT_SECTION();
+ /*
+ * Send out relcache inval for metapage change (probably unnecessary
+ * here, but let's be safe).
+ */
+ CacheInvalidateRelcache(rel);
+
/*
* swap root write lock for read lock. There is no danger of anyone
* else accessing the new root page while it's unlocked, since no one
Assert(rootblkno != P_NONE);
rootlevel = metad->btm_fastlevel;
+ /*
+ * Cache the metapage data for next time
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+
/*
* We are done with the metapage; arrange to release it via first
* _bt_relandgetbuf call
uint32 rootlevel;
BTMetaPageData *metad;
+ /*
+ * We don't try to use cached metapage data here, since (a) this path is
+ * not performance-critical, and (b) if we are here it suggests our cache
+ * is out-of-date anyway. In light of point (b), it's probably safest to
+ * actively flush any cached metapage info.
+ */
+ if (rel->rd_amcache)
+ pfree(rel->rd_amcache);
+ rel->rd_amcache = NULL;
+
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
END_CRIT_SECTION();
- /* release buffers */
+ /* release buffers; send out relcache inval if metapage changed */
if (BufferIsValid(metabuf))
+ {
+ CacheInvalidateRelcache(rel);
_bt_relbuf(rel, metabuf);
+ }
_bt_relbuf(rel, pbuf);
_bt_relbuf(rel, rbuf);
_bt_relbuf(rel, buf);
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.144 2006/04/01 03:03:37 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.145 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
+#include "utils/inval.h"
#include "utils/memutils.h"
}
#endif /* BTREE_BUILD_STATS */
+ /*
+ * If we are reindexing a pre-existing index, it is critical to send out
+ * a relcache invalidation SI message to ensure all backends re-read the
+ * index metapage. In most circumstances the update-stats operation will
+ * cause that to happen, but at the moment there are corner cases where
+ * no pg_class update will occur, so force an inval here. XXX FIXME:
+ * the upper levels of CREATE INDEX should handle the stats update as
+ * well as guaranteeing relcache inval.
+ */
+ CacheInvalidateRelcache(index);
+
/* since we just counted the # of tuples, may as well update stats */
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.238 2006/03/05 15:58:45 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.239 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
*/
relation->rd_indexprs = NIL;
relation->rd_indpred = NIL;
+ relation->rd_amcache = NULL;
}
/*
RelationInitPhysicalAddr(relation);
/* Make sure targblock is reset in case rel was truncated */
relation->rd_targblock = InvalidBlockNumber;
+ /* Must free any AM cached data, too */
+ if (relation->rd_amcache)
+ pfree(relation->rd_amcache);
+ relation->rd_amcache = NULL;
/* Okay, now it's valid again */
relation->rd_isvalid = true;
}
rel->rd_indexlist = NIL;
rel->rd_oidindex = InvalidOid;
rel->rd_createSubid = InvalidSubTransactionId;
+ rel->rd_amcache = NULL;
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
/*
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.88 2006/03/05 15:59:07 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.89 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* cached, namely those with subtype zero. The arrays are indexed by
* strategy or support number, which is a sufficient identifier given that
* restriction.
+ *
+ * Note: rd_amcache is available for index AMs to cache private data about
+ * an index. This must be just a cache since it may get reset at any time
+ * (in particular, it will get reset by a relcache inval message for the
+ * index). If used, it must point to a single memory chunk palloc'd in
+ * rd_indexcxt. A relcache reset will include freeing that chunk and
+ * setting rd_amcache = NULL.
*/
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
RelationAmInfo *rd_aminfo; /* lookup info for funcs found in pg_am */
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
List *rd_indexprs; /* index expression trees, if any */
List *rd_indpred; /* index predicate tree, if any */
+ void *rd_amcache; /* available for use by index AM */
/* statistics collection area */
PgStat_Info pgstat_info;
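
As a usage sketch for index AM authors, the rd_amcache contract described in
the comment above works out to the pattern below.  The struct and function
names are hypothetical; only rd_amcache, rd_indexcxt, and MemoryContextAlloc
are real, and "read the metapage" stands in for whatever the AM actually
caches (nbtree caches a BTMetaPageData this way):

    #include "postgres.h"
    #include "storage/block.h"
    #include "utils/rel.h"

    /* Hypothetical private data an index AM might cache per relcache entry. */
    typedef struct MyAmCacheData
    {
        BlockNumber cached_root;
        uint32      cached_level;
    } MyAmCacheData;

    /*
     * Return the AM's cached data, rebuilding it if the relcache entry has
     * been reset (rd_amcache == NULL) since we last looked.  A relcache
     * inval message for the index frees the chunk and nulls the pointer,
     * so the cache must be treated as disposable.
     */
    static MyAmCacheData *
    myam_get_cache(Relation rel)
    {
        if (rel->rd_amcache == NULL)
        {
            MyAmCacheData *cache;

            /* must be a single chunk palloc'd in rd_indexcxt, per the note above */
            cache = (MyAmCacheData *)
                MemoryContextAlloc(rel->rd_indexcxt, sizeof(MyAmCacheData));
            cache->cached_root = InvalidBlockNumber;    /* fill from metapage here */
            cache->cached_level = 0;
            rel->rd_amcache = cache;
        }
        return (MyAmCacheData *) rel->rd_amcache;
    }
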