<!--
Documentation of the system catalogs, directed toward PostgreSQL developers
- $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.65 2003/01/19 00:13:28 momjian Exp $
+ $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.66 2003/02/22 00:45:03 tgl Exp $
-->
<chapter id="catalogs">
<entry>bulk-delete function</entry>
</row>
+ <row>
+ <entry>amvacuumcleanup</entry>
+ <entry><type>regproc</type></entry>
+ <entry>pg_proc.oid</entry>
+ <entry>post-VACUUM cleanup function</entry>
+ </row>
+
<row>
<entry>amcostestimate</entry>
<entry><type>regproc</type></entry>
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.99 2002/11/13 00:39:46 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.100 2003/02/22 00:45:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
result->num_pages = num_pages;
- result->tuples_removed = tuples_removed;
result->num_index_tuples = num_index_tuples;
+ result->tuples_removed = tuples_removed;
+ result->pages_free = 0;
PG_RETURN_POINTER(result);
}
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.60 2002/09/04 20:31:09 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.61 2003/02/22 00:45:03 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
result->num_pages = num_pages;
- result->tuples_removed = tuples_removed;
result->num_index_tuples = num_index_tuples;
+ result->tuples_removed = tuples_removed;
+ result->pages_free = 0;
PG_RETURN_POINTER(result);
}
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.63 2003/01/08 19:41:40 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.64 2003/02/22 00:45:03 tgl Exp $
*
* INTERFACE ROUTINES
* index_open - open an index relation by relation OID
* index_restrpos - restore a scan position
* index_getnext - get the next tuple from a scan
* index_bulk_delete - bulk deletion of index tuples
+ * index_vacuum_cleanup - post-deletion cleanup of an index
* index_cost_estimator - fetch amcostestimate procedure OID
* index_getprocid - get a support procedure OID
*
return result;
}
+/* ----------------
+ * index_vacuum_cleanup - do post-deletion cleanup of an index
+ *
+ * return value is an optional palloc'd struct of statistics
+ * ----------------
+ */
+IndexBulkDeleteResult *
+index_vacuum_cleanup(Relation indexRelation,
+ IndexVacuumCleanupInfo *info,
+ IndexBulkDeleteResult *stats)
+{
+ RegProcedure procedure;
+ IndexBulkDeleteResult *result;
+
+ RELATION_CHECKS;
+
+ /* It's okay for an index AM not to have a vacuumcleanup procedure */
+ if (!RegProcedureIsValid(indexRelation->rd_am->amvacuumcleanup))
+ return stats;
+
+ GET_REL_PROCEDURE(vacuum_cleanup, amvacuumcleanup);
+
+ result = (IndexBulkDeleteResult *)
+ DatumGetPointer(OidFunctionCall3(procedure,
+ PointerGetDatum(indexRelation),
+ PointerGetDatum((Pointer) info),
+ PointerGetDatum((Pointer) stats)));
+
+ return result;
+}
+
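
In outline, VACUUM now drives index cleanup as a two-phase sequence: bulk-delete
first, then vacuum-cleanup applied to the same stats struct (the vacuum.c and
vacuumlazy.c changes below show the real callers). A condensed sketch, where
myindex and my_tid_reaped are hypothetical placeholders:

	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;

	/* Phase 1: scan the index, removing entries the callback approves */
	stats = index_bulk_delete(myindex, my_tid_reaped, NULL);

	/* Phase 2: post-VACUUM cleanup; an AM with no vacuumcleanup proc
	 * just gets the stats struct handed back unchanged */
	vcinfo.vacuum_full = false;		/* lazy VACUUM, not VACUUM FULL */
	vcinfo.message_level = DEBUG1;	/* elog level for progress messages */
	stats = index_vacuum_cleanup(myindex, &vcinfo, stats);

	if (stats)
	{
		/* report stats->num_pages, stats->pages_free, etc. */
		pfree(stats);
	}
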
/* ----------------
* index_cost_estimator
*
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.98 2003/02/22 00:45:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
if (!_bt_isequal(itupdesc, page, P_HIKEY,
natts, itup_scankey))
break;
- nblkno = opaque->btpo_next;
- if (nbuf != InvalidBuffer)
- _bt_relbuf(rel, nbuf);
- nbuf = _bt_getbuf(rel, nblkno, BT_READ);
- page = BufferGetPage(nbuf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* Advance to next non-dead page --- there must be one */
+ for (;;)
+ {
+ nblkno = opaque->btpo_next;
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ nbuf = _bt_getbuf(rel, nblkno, BT_READ);
+ page = BufferGetPage(nbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "_bt_check_unique: fell off the end of %s",
+ RelationGetRelationName(rel));
+ }
maxoff = PageGetMaxOffsetNumber(page);
offset = P_FIRSTDATAKEY(opaque);
}
_bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
random() > (MAX_RANDOM_VALUE / 100))
{
- /* step right one page */
- BlockNumber rblkno = lpageop->btpo_next;
- Buffer rbuf;
-
/*
- * must write-lock next page before releasing write lock on
+ * step right to next non-dead page
+ *
+ * must write-lock that page before releasing write lock on
* current page; else someone else's _bt_check_unique scan
- * could fail to see our insertion.
+ * could fail to see our insertion. write locks on intermediate
+ * dead pages won't do because we don't know when they will get
+ * de-linked from the tree.
*/
- rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
+ Buffer rbuf = InvalidBuffer;
+
+ for (;;)
+ {
+ BlockNumber rblkno = lpageop->btpo_next;
+
+ if (rbuf != InvalidBuffer)
+ _bt_relbuf(rel, rbuf);
+ rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(lpageop))
+ break;
+ if (P_RIGHTMOST(lpageop))
+ elog(ERROR, "_bt_insertonpg: fell off the end of %s",
+ RelationGetRelationName(rel));
+ }
_bt_relbuf(rel, buf);
buf = rbuf;
- page = BufferGetPage(buf);
- lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
movedright = true;
}
BTPageOpaque ropaque,
lopaque,
oopaque;
- Buffer sbuf = 0;
- Page spage = 0;
+ Buffer sbuf = InvalidBuffer;
+ Page spage = NULL;
+ BTPageOpaque sopaque = NULL;
Size itemsz;
ItemId itemid;
BTItem item;
{
sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf);
+ sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+ if (sopaque->btpo_prev != ropaque->btpo_prev)
+ elog(PANIC, "btree: right sibling's left-link doesn't match");
}
/*
*/
START_CRIT_SECTION();
+ if (!P_RIGHTMOST(ropaque))
+ sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
/* XLOG stuff */
if (!rel->rd_istemp)
{
if (!P_RIGHTMOST(ropaque))
{
- BTPageOpaque sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
-
- sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
-
rdata[2].next = &(rdata[3]);
rdata[3].buffer = sbuf;
rdata[3].data = NULL;
Buffer buf;
Page page;
BTPageOpaque opaque;
- OffsetNumber offnum,
- minoff,
- maxoff;
- ItemId itemid;
- BTItem item;
buf = _bt_getbuf(rel, blkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- minoff = P_FIRSTDATAKEY(opaque);
- maxoff = PageGetMaxOffsetNumber(page);
- /*
- * start = InvalidOffsetNumber means "search the whole page".
- * We need this test anyway due to possibility that
- * page has a high key now when it didn't before.
- */
- if (start < minoff)
- start = minoff;
-
- /*
- * These loops will check every item on the page --- but in an order
- * that's attuned to the probability of where it actually is. Scan
- * to the right first, then to the left.
- */
- for (offnum = start;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum))
+ if (!P_IGNORE(opaque))
{
- itemid = PageGetItemId(page, offnum);
- item = (BTItem) PageGetItem(page, itemid);
- if (BTItemSame(item, &stack->bts_btitem))
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ ItemId itemid;
+ BTItem item;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * start = InvalidOffsetNumber means "search the whole page".
+ * We need this test anyway due to possibility that
+ * page has a high key now when it didn't before.
+ */
+ if (start < minoff)
+ start = minoff;
+
+ /*
+ * These loops will check every item on the page --- but in an
+ * order that's attuned to the probability of where it actually
+ * is. Scan to the right first, then to the left.
+ */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
{
- /* Return accurate pointer to where link is now */
- stack->bts_blkno = blkno;
- stack->bts_offset = offnum;
- return buf;
+ itemid = PageGetItemId(page, offnum);
+ item = (BTItem) PageGetItem(page, itemid);
+ if (BTItemSame(item, &stack->bts_btitem))
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
}
- }
- for (offnum = OffsetNumberPrev(start);
- offnum >= minoff;
- offnum = OffsetNumberPrev(offnum))
- {
- itemid = PageGetItemId(page, offnum);
- item = (BTItem) PageGetItem(page, itemid);
- if (BTItemSame(item, &stack->bts_btitem))
+ for (offnum = OffsetNumberPrev(start);
+ offnum >= minoff;
+ offnum = OffsetNumberPrev(offnum))
{
- /* Return accurate pointer to where link is now */
- stack->bts_blkno = blkno;
- stack->bts_offset = offnum;
- return buf;
+ itemid = PageGetItemId(page, offnum);
+ item = (BTItem) PageGetItem(page, itemid);
+ if (BTItemSame(item, &stack->bts_btitem))
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
}
}
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rootpage = BufferGetPage(rootbuf);
rootblknum = BufferGetBlockNumber(rootbuf);
+
+ /* acquire lock on the metapage */
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.60 2003/02/22 00:45:04 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
*/
#include "postgres.h"
-#include <time.h>
-
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
-extern bool FixBTree; /* comments in nbtree.c */
-extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
-
-/*
- * We use high-concurrency locking on btrees. There are two cases in
- * which we don't do locking. One is when we're building the btree.
- * Since the creating transaction has not committed, no one can see
- * the index, and there's no reason to share locks. The second case
- * is when we're just starting up the database system. We use some
- * special-purpose initialization code in the relation cache manager
- * (see utils/cache/relcache.c) to allow us to do indexed scans on
- * the system catalogs before we'd normally be able to. This happens
- * before the lock table is fully initialized, so we can't use it.
- * Strictly speaking, this violates 2pl, but we don't do 2pl on the
- * system catalogs anyway, so I declare this to be okay.
- */
-
-#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
-
/*
* _bt_metapinit() -- Initialize the metadata page of a new btree.
+ *
+ * Note: there's no real need for any locking here. Since the transaction
+ * creating the index hasn't committed yet, no one else can even see the index,
+ * much less be trying to use it.
*/
void
_bt_metapinit(Relation rel)
BTMetaPageData *metad;
BTPageOpaque op;
- /* can't be sharing this with anyone, now... */
- if (USELOCKING)
- LockRelation(rel, AccessExclusiveLock);
-
if (RelationGetNumberOfBlocks(rel) != 0)
elog(ERROR, "Cannot initialize non-empty btree %s",
RelationGetRelationName(rel));
END_CRIT_SECTION();
WriteBuffer(buf);
-
- /* all done */
- if (USELOCKING)
- UnlockRelation(rel, AccessExclusiveLock);
}
/*
* what we will return is the old root, which is now just the leftmost
* page on a probably-not-very-wide level. For most purposes this is
* as good as or better than the true root, so we do not bother to
- * insist on finding the true root.
+ * insist on finding the true root. We do, however, guarantee to
+ * return a live (not deleted or half-dead) page.
*
* On successful return, the root page is pinned and read-locked.
* The metadata page is not locked or pinned on exit.
Page rootpage;
BTPageOpaque rootopaque;
BlockNumber rootblkno;
+ uint32 rootlevel;
BTMetaPageData *metad;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
metad = BTPageGetMeta(metapg);
+ /* sanity-check the metapage */
if (!(metaopaque->btpo_flags & BTP_META) ||
metad->btm_magic != BTREE_MAGIC)
elog(ERROR, "Index %s is not a btree",
/*
* Race condition: if someone else initialized the metadata
* between the time we released the read lock and acquired the
- * write lock, above, we must avoid doing it again.
+ * write lock, we must avoid doing it again.
*/
- if (metad->btm_root == P_NONE)
+ if (metad->btm_root != P_NONE)
{
/*
- * Get, initialize, write, and leave a lock of the appropriate
- * type on the new root page. Since this is the first page in
- * the tree, it's a leaf as well as the root.
+ * Metadata initialized by someone else. In order to
+ * guarantee no deadlocks, we have to release the metadata
+ * page and start all over again. (Is that really true?
+ * But it's hardly worth trying to optimize this case.)
*/
- rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
- rootblkno = BufferGetBlockNumber(rootbuf);
- rootpage = BufferGetPage(rootbuf);
-
- _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
- rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
- rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
- rootopaque->btpo.level = 0;
-
- /* NO ELOG(ERROR) till meta is updated */
- START_CRIT_SECTION();
-
- metad->btm_root = rootblkno;
- metad->btm_level = 0;
- metad->btm_fastroot = rootblkno;
- metad->btm_fastlevel = 0;
+ _bt_relbuf(rel, metabuf);
+ return _bt_getroot(rel, access);
+ }
- /* XLOG stuff */
- if (!rel->rd_istemp)
- {
- xl_btree_newroot xlrec;
- XLogRecPtr recptr;
- XLogRecData rdata;
+ /*
+ * Get, initialize, write, and leave a lock of the appropriate
+ * type on the new root page. Since this is the first page in
+ * the tree, it's a leaf as well as the root.
+ */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootblkno = BufferGetBlockNumber(rootbuf);
+ rootpage = BufferGetPage(rootbuf);
+
+ _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+ rootopaque->btpo.level = 0;
+
+ /* NO ELOG(ERROR) till meta is updated */
+ START_CRIT_SECTION();
+
+ metad->btm_root = rootblkno;
+ metad->btm_level = 0;
+ metad->btm_fastroot = rootblkno;
+ metad->btm_fastlevel = 0;
+
+ /* XLOG stuff */
+ if (!rel->rd_istemp)
+ {
+ xl_btree_newroot xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata;
- xlrec.node = rel->rd_node;
- xlrec.rootblk = rootblkno;
- xlrec.level = 0;
+ xlrec.node = rel->rd_node;
+ xlrec.rootblk = rootblkno;
+ xlrec.level = 0;
- rdata.buffer = InvalidBuffer;
- rdata.data = (char *) &xlrec;
- rdata.len = SizeOfBtreeNewroot;
- rdata.next = NULL;
+ rdata.buffer = InvalidBuffer;
+ rdata.data = (char *) &xlrec;
+ rdata.len = SizeOfBtreeNewroot;
+ rdata.next = NULL;
- recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
- PageSetLSN(rootpage, recptr);
- PageSetSUI(rootpage, ThisStartUpID);
- PageSetLSN(metapg, recptr);
- PageSetSUI(metapg, ThisStartUpID);
- }
+ PageSetLSN(rootpage, recptr);
+ PageSetSUI(rootpage, ThisStartUpID);
+ PageSetLSN(metapg, recptr);
+ PageSetSUI(metapg, ThisStartUpID);
+ }
- END_CRIT_SECTION();
+ END_CRIT_SECTION();
- _bt_wrtnorelbuf(rel, rootbuf);
+ _bt_wrtnorelbuf(rel, rootbuf);
- /*
- * swap root write lock for read lock. There is no danger of
- * anyone else accessing the new root page while it's unlocked,
- * since no one else knows where it is yet.
- */
- LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(rootbuf, BT_READ);
+ /*
+ * swap root write lock for read lock. There is no danger of
+ * anyone else accessing the new root page while it's unlocked,
+ * since no one else knows where it is yet.
+ */
+ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(rootbuf, BT_READ);
- /* okay, metadata is correct, write and release it */
- _bt_wrtbuf(rel, metabuf);
- }
- else
- {
- /*
- * Metadata initialized by someone else. In order to
- * guarantee no deadlocks, we have to release the metadata
- * page and start all over again.
- */
- _bt_relbuf(rel, metabuf);
- return _bt_getroot(rel, access);
- }
+ /* okay, metadata is correct, write and release it */
+ _bt_wrtbuf(rel, metabuf);
}
else
{
rootblkno = metad->btm_fastroot;
+ Assert(rootblkno != P_NONE);
+ rootlevel = metad->btm_fastlevel;
_bt_relbuf(rel, metabuf); /* done with the meta page */
- rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ for (;;)
+ {
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ if (!P_IGNORE(rootopaque))
+ break;
+
+ /* it's dead, Jim. step right one page */
+ if (P_RIGHTMOST(rootopaque))
+ elog(ERROR, "No live root page found in %s",
+ RelationGetRelationName(rel));
+ rootblkno = rootopaque->btpo_next;
+
+ _bt_relbuf(rel, rootbuf);
+ }
+
+ /* Note: can't check btpo.level on deleted pages */
+ if (rootopaque->btpo.level != rootlevel)
+ elog(ERROR, "Root page %u of %s has level %u, expected %u",
+ rootblkno, RelationGetRelationName(rel),
+ rootopaque->btpo.level, rootlevel);
}
/*
Page metapg;
BTPageOpaque metaopaque;
Buffer rootbuf;
+ Page rootpage;
+ BTPageOpaque rootopaque;
BlockNumber rootblkno;
+ uint32 rootlevel;
BTMetaPageData *metad;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
}
rootblkno = metad->btm_root;
+ rootlevel = metad->btm_level;
_bt_relbuf(rel, metabuf); /* done with the meta page */
- rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ for (;;)
+ {
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ if (!P_IGNORE(rootopaque))
+ break;
+
+ /* it's dead, Jim. step right one page */
+ if (P_RIGHTMOST(rootopaque))
+ elog(ERROR, "No live root page found in %s",
+ RelationGetRelationName(rel));
+ rootblkno = rootopaque->btpo_next;
+
+ _bt_relbuf(rel, rootbuf);
+ }
+
+ /* Note: can't check btpo.level on deleted pages */
+ if (rootopaque->btpo.level != rootlevel)
+ elog(ERROR, "Root page %u of %s has level %u, expected %u",
+ rootblkno, RelationGetRelationName(rel),
+ rootopaque->btpo.level, rootlevel);
return rootbuf;
}
/*
* _bt_getbuf() -- Get a buffer by block number for read or write.
*
+ * blkno == P_NEW means to get an unallocated index page.
+ *
* When this routine returns, the appropriate lock is set on the
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned").
}
else
{
+ bool needLock;
Page page;
+ /* XXX soon: ask FSM about free space */
+
/*
* Extend the relation by one page.
*
- * Extend bufmgr code is unclean and so we have to use extra locking
- * here.
+ * We have to use a lock to ensure no one else is extending the rel at
+ * the same time, else we will both try to initialize the same new
+ * page. We can skip locking for new or temp relations, however,
+ * since no one else could be accessing them.
*/
- LockPage(rel, 0, ExclusiveLock);
- buf = ReadBuffer(rel, blkno);
+ needLock = !(rel->rd_isnew || rel->rd_istemp);
+
+ if (needLock)
+ LockPage(rel, 0, ExclusiveLock);
+
+ buf = ReadBuffer(rel, P_NEW);
+
+ /*
+ * Release the file-extension lock; it's now OK for someone else to
+ * extend the relation some more.
+ */
+ if (needLock)
+ UnlockPage(rel, 0, ExclusiveLock);
+
+ /* Acquire appropriate buffer lock on new page */
LockBuffer(buf, access);
- UnlockPage(rel, 0, ExclusiveLock);
/* Initialize the new page before returning it */
page = BufferGetPage(buf);
* and a pin on the buffer.
*
* NOTE: actually, the buffer manager just marks the shared buffer page
- * dirty here, the real I/O happens later. Since we can't persuade the
- * Unix kernel to schedule disk writes in a particular order, there's not
- * much point in worrying about this. The most we can say is that all the
- * writes will occur before commit.
+ * dirty here; the real I/O happens later. This is okay since we are not
+ * relying on write ordering anyway. The WAL mechanism is responsible for
+ * guaranteeing correctness after a crash.
*/
void
_bt_wrtbuf(Relation rel, Buffer buf)
* mistake. On exit, metapage data is correct and we no longer have
* a pin or lock on the metapage.
*
- * XXX this is not used for splitting anymore, only in nbtsort.c at the
- * completion of btree building.
+ * Actually this is not used for splitting on-the-fly anymore. It's only used
+ * in nbtsort.c at the completion of btree building, where we know we have
+ * sole access to the index anyway.
*/
void
_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
/*
* Delete an item from a btree page.
*
+ * This must only be used for deleting leaf items. Deleting an item on a
+ * non-leaf page has to be done as part of an atomic action that includes
+ * deleting the page it points to.
+ *
* This routine assumes that the caller has pinned and locked the buffer,
* and will write the buffer afterwards.
*/
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.96 2003/02/22 00:45:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "access/nbtree.h"
#include "catalog/index.h"
#include "miscadmin.h"
+#include "storage/freespace.h"
/* Working state for btbuild and its callback */
} BTBuildState;
-bool BuildingBtree = false; /* see comment in btbuild() */
bool FastBuild = true; /* use SORT instead of insertion build */
/*
void
AtEOXact_nbtree(void)
{
- /*
- * Note: these actions should only be necessary during xact abort; but
- * they can't hurt during a commit.
- */
-
- /* If we were building a btree, we ain't anymore. */
- BuildingBtree = false;
+ /* nothing to do at the moment */
}
double reltuples;
BTBuildState buildstate;
- /* set flag to disable locking */
- BuildingBtree = true;
-
/*
* bootstrap processing does something strange, so don't use
* sort/build for initial catalog indices. at some point i need to
}
#endif /* BTREE_BUILD_STATS */
- /* all done */
- BuildingBtree = false;
-
/*
* Since we just counted the tuples in the heap, we update its stats
* in pg_class to guarantee that the planner takes advantage of the
* We now need to back up the scan one item, so that the next
* cycle will re-examine the same offnum on this page (which
* now holds the next item).
- *
- * For now, just hack the current-item index. Will need to
- * be smarter when deletion includes removal of empty
- * index pages.
*/
current->ip_posid--;
}
result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
result->num_pages = num_pages;
- result->tuples_removed = tuples_removed;
result->num_index_tuples = num_index_tuples;
+ result->tuples_removed = tuples_removed;
+ result->pages_free = 0; /* not computed here */
PG_RETURN_POINTER(result);
}
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Here, we scan looking for pages we can delete or return to the freelist.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+Datum
+btvacuumcleanup(PG_FUNCTION_ARGS)
+{
+ Relation rel = (Relation) PG_GETARG_POINTER(0);
+#ifdef NOT_USED
+ IndexVacuumCleanupInfo *info = (IndexVacuumCleanupInfo *) PG_GETARG_POINTER(1);
+#endif
+ IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(2);
+ BlockNumber num_pages;
+ BlockNumber blkno;
+ PageFreeSpaceInfo *pageSpaces;
+ int nFreePages,
+ maxFreePages;
+
+ Assert(stats != NULL);
+
+ num_pages = RelationGetNumberOfBlocks(rel);
+
+ /* No point in remembering more than MaxFSMPages pages */
+ maxFreePages = MaxFSMPages;
+ if ((BlockNumber) maxFreePages > num_pages)
+ maxFreePages = (int) num_pages + 1; /* +1 to avoid palloc(0) */
+ pageSpaces = (PageFreeSpaceInfo *) palloc(maxFreePages * sizeof(PageFreeSpaceInfo));
+ nFreePages = 0;
+
+ /*
+ * Scan through all pages of index, except metapage. (Any pages added
+ * after we start the scan will not be examined; this should be fine,
+ * since they can't possibly be empty.)
+ */
+ for (blkno = BTREE_METAPAGE+1; blkno < num_pages; blkno++)
+ {
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /* XXX if safe-to-reclaim... */
+ if (nFreePages < maxFreePages)
+ {
+ pageSpaces[nFreePages].blkno = blkno;
+ /* The avail-space value is bogus, but must be < BLCKSZ */
+ pageSpaces[nFreePages].avail = BLCKSZ-1;
+ nFreePages++;
+ }
+ }
+ _bt_relbuf(rel, buf);
+ }
+
+ /*
+ * Update the shared Free Space Map with the info we now have about
+ * free space in the index, discarding any old info the map may have.
+ * We do not need to sort the page numbers; they're in order already.
+ */
+ MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces);
+
+ pfree(pageSpaces);
+
+ /* update statistics */
+ stats->num_pages = num_pages;
+ stats->pages_free = nFreePages;
+
+ PG_RETURN_POINTER(stats);
+}
+
/*
* Restore scan position when btgettuple is called to continue a scan.
*
maxoff;
BTPageOpaque opaque;
Buffer nextbuf;
- ItemPointerData target = so->curHeapIptr;
+ ItemPointer target = &(so->curHeapIptr);
BTItem item;
BlockNumber blkno;
* current->ip_posid before first index tuple on the current page
* (_bt_step will move it right)... XXX still needed?
*/
- if (!ItemPointerIsValid(&target))
+ if (!ItemPointerIsValid(target))
{
ItemPointerSetOffsetNumber(current,
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
offnum = OffsetNumberNext(offnum))
{
item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- if (item->bti_itup.t_tid.ip_blkid.bi_hi ==
- target.ip_blkid.bi_hi &&
- item->bti_itup.t_tid.ip_blkid.bi_lo ==
- target.ip_blkid.bi_lo &&
- item->bti_itup.t_tid.ip_posid == target.ip_posid)
+ if (BTTidSame(item->bti_itup.t_tid, *target))
{
/* Found it */
current->ip_posid = offnum;
/*
* The item we're looking for moved right at least one page, so
* move right. We are careful here to pin and read-lock the next
- * page before releasing the current one. This ensures that a
- * concurrent btbulkdelete scan cannot pass our position --- if it
+ * non-dead page before releasing the current one. This ensures that
+ * a concurrent btbulkdelete scan cannot pass our position --- if it
* did, it might be able to reach and delete our target item before
* we can find it again.
*/
if (P_RIGHTMOST(opaque))
- elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
+ elog(ERROR, "_bt_restscan: my bits moved right off the end of the world!"
"\n\tRecreate index %s.", RelationGetRelationName(rel));
-
- blkno = opaque->btpo_next;
- nextbuf = _bt_getbuf(rel, blkno, BT_READ);
+ /* Advance to next non-dead page --- there must be one */
+ nextbuf = InvalidBuffer;
+ for (;;)
+ {
+ blkno = opaque->btpo_next;
+ if (nextbuf != InvalidBuffer)
+ _bt_relbuf(rel, nextbuf);
+ nextbuf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(nextbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "_bt_restscan: fell off the end of %s",
+ RelationGetRelationName(rel));
+ }
_bt_relbuf(rel, buf);
so->btso_curbuf = buf = nextbuf;
- page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
offnum = P_FIRSTDATAKEY(opaque);
ItemPointerSet(current, blkno, offnum);
}
/*-------------------------------------------------------------------------
*
* nbtsearch.c
- * search code for postgres btrees.
+ * Search code for postgres btrees.
*
*
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.74 2003/02/22 00:45:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "access/nbtree.h"
+static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
par_blkno = BufferGetBlockNumber(*bufP);
/*
- * We need to save the bit image of the index entry we chose in
+ * We need to save the location of the index entry we chose in
* the parent page on a stack. In case we split the tree, we'll
- * use this bit image to figure out what our real parent page is,
- * in case the parent splits while we're working lower in the
+ * use the stack to work back up to the parent page. We also save
+ * the actual downlink (TID) to uniquely identify the index entry,
+ * in case it moves right while we're working lower in the
* tree. See the paper by Lehman and Yao for how this is detected
* and handled. (We use the child link to disambiguate duplicate
* keys in the index -- Lehman and Yao disallow duplicate keys.)
/*
* _bt_moveright() -- move right in the btree if necessary.
*
- * When we drop and reacquire a pointer to a page, it is possible that
+ * When we follow a pointer to reach a page, it is possible that
* the page has changed in the meanwhile. If this happens, we're
* guaranteed that the page has "split right" -- that is, that any
* data that appeared on the page originally is either on the page
* right. (If the scan key is equal to the high key, we might or
* might not need to move right; have to scan the page first anyway.)
* It could even have split more than once, so scan as far as needed.
+ *
+ * We also have to move right if we followed a link that brought us to
+ * a dead page.
*/
while (!P_RIGHTMOST(opaque) &&
- _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
+ (P_IGNORE(opaque) ||
+ _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0))
{
/* step right one page */
BlockNumber rblkno = opaque->btpo_next;
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
+ if (P_IGNORE(opaque))
+ elog(ERROR, "_bt_moveright: fell off the end of %s",
+ RelationGetRelationName(rel));
+
return buf;
}
OffsetNumber offnum,
maxoff;
BlockNumber blkno;
- BlockNumber obknum;
/*
 * Don't use ItemPointerGetOffsetNumber or you risk an assertion failure
offnum = OffsetNumberNext(offnum);
else
{
- /* walk right to the next page with data */
+ /* Walk right to the next page with data */
for (;;)
{
/* if we're at end of scan, release the buffer and return */
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- /* done if it's not empty */
- offnum = P_FIRSTDATAKEY(opaque);
- if (!PageIsEmpty(page) && offnum <= maxoff)
- break;
+ if (!P_IGNORE(opaque))
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ /* done if it's not empty */
+ offnum = P_FIRSTDATAKEY(opaque);
+ if (!PageIsEmpty(page) && offnum <= maxoff)
+ break;
+ }
}
}
}
- else
+ else /* backwards scan */
{
if (offnum > P_FIRSTDATAKEY(opaque))
offnum = OffsetNumberPrev(offnum);
else
{
- /* walk left to the next page with data */
+ /*
+ * Walk left to the next page with data. This is much more
+ * complex than the walk-right case because of the possibility
+ * that the page to our left splits while we are in flight to it,
+ * plus the possibility that the page we were on gets deleted
+ * after we leave it. See nbtree/README for details.
+ */
for (;;)
{
- /* if we're at end of scan, release the buffer and return */
- if (P_LEFTMOST(opaque))
+ *bufP = _bt_walk_left(rel, *bufP);
+
+ /* if we're at end of scan, return failure */
+ if (*bufP == InvalidBuffer)
{
- _bt_relbuf(rel, *bufP);
ItemPointerSetInvalid(current);
- *bufP = so->btso_curbuf = InvalidBuffer;
+ so->btso_curbuf = InvalidBuffer;
return false;
}
- /* step left */
- obknum = BufferGetBlockNumber(*bufP);
- blkno = opaque->btpo_prev;
- _bt_relbuf(rel, *bufP);
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
/*
- * If the adjacent page just split, then we have to walk
- * right to find the block that's now adjacent to where we
- * were. Because pages only split right, we don't have to
- * worry about this failing to terminate.
+ * Okay, we managed to move left to a non-deleted page.
+ * Done if it's not half-dead and not empty. Else loop back
+ * and do it all again.
*/
- while (opaque->btpo_next != obknum)
+ if (!P_IGNORE(opaque))
{
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP);
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = maxoff;
+ if (!PageIsEmpty(page) &&
+ maxoff >= P_FIRSTDATAKEY(opaque))
+ break;
}
- /* done if it's not empty */
- maxoff = PageGetMaxOffsetNumber(page);
- offnum = maxoff;
- if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
- break;
}
}
}
return true;
}
+/*
+ * _bt_walk_left() -- step left one page, if possible
+ *
+ * The given buffer must be pinned and read-locked. This will be dropped
+ * before stepping left. On return, we have pin and read lock on the
+ * returned page, instead.
+ *
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
+ *
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
+ */
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf)
+{
+ Page page;
+ BTPageOpaque opaque;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ BlockNumber obknum;
+ BlockNumber lblkno;
+ BlockNumber blkno;
+ int tries;
+
+ /* if we're at end of tree, release buf and return failure */
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ break;
+ }
+ /* remember original page we are stepping left from */
+ obknum = BufferGetBlockNumber(buf);
+ /* step left */
+ blkno = lblkno = opaque->btpo_prev;
+ _bt_relbuf(rel, buf);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /*
+ * If this isn't the page we want, walk right till we find
+ * what we want --- but go no more than four hops (an
+ * arbitrary limit). If we don't find the correct page by then,
+ * the most likely bet is that the original page got deleted
+ * and isn't in the sibling chain at all anymore, not that its
+ * left sibling got split more than four times.
+ *
+ * Note that it is correct to test P_ISDELETED not P_IGNORE
+ * here, because half-dead pages are still in the sibling
+ * chain. Caller must reject half-dead pages if wanted.
+ */
+ tries = 0;
+ for (;;)
+ {
+ if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+ {
+ /* Found desired page, return it */
+ return buf;
+ }
+ if (P_RIGHTMOST(opaque) || ++tries > 4)
+ break;
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Return to the original page to see what's up */
+ _bt_relbuf(rel, buf);
+ buf = _bt_getbuf(rel, obknum, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * It was deleted. Move right to first nondeleted page
+ * (there must be one); that is the page that has acquired the
+ * deleted one's keyspace, so stepping left from it will take
+ * us where we want to be.
+ */
+ for (;;)
+ {
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "_bt_walk_left: fell off the end of %s",
+ RelationGetRelationName(rel));
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISDELETED(opaque))
+ break;
+ }
+ /*
+ * Now return to top of loop, resetting obknum to
+ * point to this nondeleted page, and try again.
+ */
+ }
+ else
+ {
+ /*
+ * It wasn't deleted; the explanation had better be
+ * that the page to the left got split or deleted.
+ * Without this check, we'd go into an infinite loop
+ * if there's anything wrong.
+ */
+ if (opaque->btpo_prev == lblkno)
+ elog(ERROR, "_bt_walk_left: can't find left sibling in %s",
+ RelationGetRelationName(rel));
+ /* Okay to try again with new lblkno value */
+ }
+ }
+
+ return InvalidBuffer;
+}
+
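Because _bt_walk_left may return a half-dead page when working on a non-leaf
level, a caller that needs a usable page loops until P_IGNORE is clear, as the
backward-scan branch of _bt_step does above. The pattern, condensed:

	for (;;)
	{
		buf = _bt_walk_left(rel, buf);
		if (buf == InvalidBuffer)
			break;				/* no page to the left: end of scan */
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (!P_IGNORE(opaque))
			break;				/* live page: examine its items */
	}
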
/*
* _bt_get_endpoint() -- Find the first or last page on a given tree level
*
* If the index is empty, we will return InvalidBuffer; any other failure
- * condition causes elog().
+ * condition causes elog(). We will not return a dead page.
*
* The returned buffer is pinned and read-locked.
*/
* step right if needed to get to it (this could happen if the
* page split since we obtained a pointer to it).
*/
- while (P_ISDELETED(opaque) ||
+ while (P_IGNORE(opaque) ||
(rightmost && !P_RIGHTMOST(opaque)))
{
blkno = opaque->btpo_next;
if (blkno == P_NONE)
- elog(ERROR, "_bt_get_endpoint: ran off end of btree");
+ elog(ERROR, "_bt_get_endpoint: fell off the end of %s",
+ RelationGetRelationName(rel));
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
if (opaque->btpo.level < level)
elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);
- /* Step to leftmost or rightmost child page */
+ /* Descend to leftmost or rightmost child page */
if (rightmost)
offnum = PageGetMaxOffsetNumber(page);
else
/*-------------------------------------------------------------------------
+ *
* nbtsort.c
* Build a btree from sorted input by loading leaf pages sequentially.
*
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.72 2003/02/22 00:45:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
ResetUsage();
}
#endif /* BTREE_BUILD_STATS */
- tuplesort_performsort(btspool->sortstate);
+ tuplesort_performsort(btspool->sortstate);
if (btspool2)
tuplesort_performsort(btspool2->sortstate);
_bt_load(btspool->index, btspool, btspool2);
if (PageAddItem(page, (Item) btitem, itemsize, itup_off,
LP_USED) == InvalidOffsetNumber)
- elog(FATAL, "btree: failed to add item to the page in _bt_sort");
+ elog(ERROR, "btree: failed to add item to the page in _bt_sort");
}
/*----------
/*
* Write out the old page. We never want to see it again, so we
- * can give up our lock (if we had one; most likely BuildingBtree
- * is set, so we aren't locking).
+ * can give up our lock.
*/
_bt_blwritepage(index, obuf);
if (s->btps_next == (BTPageState *) NULL)
{
opaque->btpo_flags |= BTP_ROOT;
- _bt_metaproot(index, blkno, s->btps_level + 1);
+ _bt_metaproot(index, blkno, s->btps_level);
}
else
{
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.75 2002/09/04 20:31:13 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.76 2003/02/22 00:45:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
result->num_pages = num_pages;
- result->tuples_removed = tuples_removed;
result->num_index_tuples = num_index_tuples;
+ result->tuples_removed = tuples_removed;
+ result->pages_free = 0;
PG_RETURN_POINTER(result);
}
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.247 2003/02/09 06:56:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.248 2003/02/22 00:45:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
scan_index(Relation indrel, double num_tuples)
{
IndexBulkDeleteResult *stats;
+ IndexVacuumCleanupInfo vcinfo;
VacRUsage ru0;
vac_init_rusage(&ru0);
/*
- * Even though we're not planning to delete anything, use the
- * ambulkdelete call, so that the scan happens within the index AM for
- * more speed.
+ * Even though we're not planning to delete anything, we use the
+ * ambulkdelete call, because (a) the scan happens within the index AM
+ * for more speed, and (b) it may want to pass private statistics to
+ * the amvacuumcleanup call.
*/
stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
+ /* Do post-VACUUM cleanup, even though we deleted nothing */
+ vcinfo.vacuum_full = true;
+ vcinfo.message_level = elevel;
+
+ stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
if (!stats)
return;
stats->num_pages, stats->num_index_tuples,
false);
- elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
+ elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s",
RelationGetRelationName(indrel),
- stats->num_pages, stats->num_index_tuples,
+ stats->num_pages, stats->pages_free, stats->num_index_tuples,
vac_show_rusage(&ru0));
/*
double num_tuples, int keep_tuples)
{
IndexBulkDeleteResult *stats;
+ IndexVacuumCleanupInfo vcinfo;
VacRUsage ru0;
vac_init_rusage(&ru0);
/* Do bulk deletion */
stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);
+ /* Do post-VACUUM cleanup */
+ vcinfo.vacuum_full = true;
+ vcinfo.message_level = elevel;
+
+ stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
if (!stats)
return;
stats->num_pages, stats->num_index_tuples,
false);
- elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
- RelationGetRelationName(indrel), stats->num_pages,
+ elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
+ RelationGetRelationName(indrel),
+ stats->num_pages, stats->pages_free,
stats->num_index_tuples - keep_tuples, stats->tuples_removed,
vac_show_rusage(&ru0));
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.23 2002/11/13 00:39:46 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.24 2003/02/22 00:45:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
tups_vacuumed,
nkeep,
nunused;
- bool did_vacuum_index = false;
int i;
VacRUsage ru0;
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i], vacrelstats);
- did_vacuum_index = true;
/* Remove tuples from heap */
lazy_vacuum_heap(onerel, vacrelstats);
/* Forget the now-vacuumed tuples, and press on */
vacrelstats->rel_tuples = num_tuples;
/* If any tuples need to be deleted, perform final vacuum cycle */
- /* XXX put a threshold on min nuber of tuples here? */
+ /* XXX put a threshold on min number of tuples here? */
if (vacrelstats->num_dead_tuples > 0)
{
/* Remove index entries */
/* Remove tuples from heap */
lazy_vacuum_heap(onerel, vacrelstats);
}
- else if (!did_vacuum_index)
+ else
{
- /* Scan indexes just to update pg_class statistics about them */
+ /* Must do post-vacuum cleanup and statistics update anyway */
for (i = 0; i < nindexes; i++)
lazy_scan_index(Irel[i], vacrelstats);
}
lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
{
IndexBulkDeleteResult *stats;
+ IndexVacuumCleanupInfo vcinfo;
VacRUsage ru0;
vac_init_rusage(&ru0);
/*
- * If the index is not partial, skip the scan, and just assume it has
- * the same number of tuples as the heap.
- */
- if (!vac_is_partial_index(indrel))
- {
- vac_update_relstats(RelationGetRelid(indrel),
- RelationGetNumberOfBlocks(indrel),
- vacrelstats->rel_tuples,
- false);
- return;
- }
-
- /*
- * If index is unsafe for concurrent access, must lock it; but a
- * shared lock should be sufficient.
+ * If index is unsafe for concurrent access, must lock it.
*/
if (!indrel->rd_am->amconcurrent)
- LockRelation(indrel, AccessShareLock);
+ LockRelation(indrel, AccessExclusiveLock);
/*
- * Even though we're not planning to delete anything, use the
- * ambulkdelete call, so that the scan happens within the index AM for
- * more speed.
+ * Even though we're not planning to delete anything, we use the
+ * ambulkdelete call, because (a) the scan happens within the index AM
+ * for more speed, and (b) it may want to pass private statistics to
+ * the amvacuumcleanup call.
*/
stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
+ /* Do post-VACUUM cleanup, even though we deleted nothing */
+ vcinfo.vacuum_full = false;
+ vcinfo.message_level = elevel;
+
+ stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
/*
* Release lock acquired above.
*/
if (!indrel->rd_am->amconcurrent)
- UnlockRelation(indrel, AccessShareLock);
+ UnlockRelation(indrel, AccessExclusiveLock);
if (!stats)
return;
stats->num_pages, stats->num_index_tuples,
false);
- elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
+ elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s",
RelationGetRelationName(indrel),
- stats->num_pages, stats->num_index_tuples,
+ stats->num_pages, stats->pages_free, stats->num_index_tuples,
vac_show_rusage(&ru0));
pfree(stats);
lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats)
{
IndexBulkDeleteResult *stats;
+ IndexVacuumCleanupInfo vcinfo;
VacRUsage ru0;
vac_init_rusage(&ru0);
/* Do bulk deletion */
stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats);
+ /* Do post-VACUUM cleanup */
+ vcinfo.vacuum_full = false;
+ vcinfo.message_level = elevel;
+
+ stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
/*
* Release lock acquired above.
*/
if (!indrel->rd_am->amconcurrent)
UnlockRelation(indrel, AccessExclusiveLock);
+ if (!stats)
+ return;
+
/* now update statistics in pg_class */
- if (stats)
- {
- vac_update_relstats(RelationGetRelid(indrel),
- stats->num_pages, stats->num_index_tuples,
- false);
+ vac_update_relstats(RelationGetRelid(indrel),
+ stats->num_pages, stats->num_index_tuples,
+ false);
- elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
- RelationGetRelationName(indrel), stats->num_pages,
- stats->num_index_tuples, stats->tuples_removed,
- vac_show_rusage(&ru0));
+ elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
+ RelationGetRelationName(indrel),
+ stats->num_pages, stats->pages_free,
+ stats->num_index_tuples, stats->tuples_removed,
+ vac_show_rusage(&ru0));
- pfree(stats);
- }
+ pfree(stats);
}
/*
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: genam.h,v 1.37 2002/09/04 20:31:36 momjian Exp $
+ * $Id: genam.h,v 1.38 2003/02/22 00:45:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "nodes/primnodes.h"
-/* Struct for statistics returned by bulk-delete operation */
+/*
+ * Struct for statistics returned by bulk-delete operation
+ *
+ * This is now also passed to the index AM's vacuum-cleanup operation,
+ * if it has one, which can modify the results as needed. Note that
+ * an index AM could choose to have bulk-delete return a larger struct
+ * of which this is just the first field; this provides a way for bulk-delete
+ * to communicate additional private data to vacuum-cleanup.
+ */
typedef struct IndexBulkDeleteResult
{
BlockNumber num_pages; /* pages remaining in index */
+ double num_index_tuples; /* tuples remaining */
double tuples_removed; /* # removed by bulk-delete operation */
- double num_index_tuples; /* # remaining */
+ BlockNumber pages_free; /* # unused pages in index */
} IndexBulkDeleteResult;
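
As the comment above suggests, an index AM can pass private state from its
bulk-delete function to its vacuum-cleanup function by returning a larger
struct whose first field is the shared result type. A hypothetical
illustration (no AM in this patch does this yet):

	typedef struct FooBulkDeleteResult
	{
		IndexBulkDeleteResult std;		/* must be the first field */
		BlockNumber first_free_block;	/* AM-private, illustrative only */
	} FooBulkDeleteResult;

The core code passes the pointer through untouched, so vacuum-cleanup can
safely cast it back to the extended type.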
/* Typedef for callback function to determine if a tuple is bulk-deletable */
typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
+/* Struct for additional arguments passed to vacuum-cleanup operation */
+typedef struct IndexVacuumCleanupInfo
+{
+ bool vacuum_full; /* VACUUM FULL (we have exclusive lock) */
+ int message_level; /* elog level for progress messages */
+} IndexVacuumCleanupInfo;
/* Struct for heap-or-index scans of system tables */
typedef struct SysScanDescData
extern IndexBulkDeleteResult *index_bulk_delete(Relation indexRelation,
IndexBulkDeleteCallback callback,
void *callback_state);
+extern IndexBulkDeleteResult *index_vacuum_cleanup(Relation indexRelation,
+ IndexVacuumCleanupInfo *info,
+ IndexBulkDeleteResult *stats);
extern RegProcedure index_cost_estimator(Relation indexRelation);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
uint16 procnum);
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
+ * $Id: nbtree.h,v 1.65 2003/02/22 00:45:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
#define BTP_META (1 << 3) /* meta-page */
+#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
/*
#define SizeOfBTItem sizeof(BTItemData)
/* Test whether items are the "same" per the above notes */
-#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
- (i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
- (i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
- (i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
- (i1)->bti_itup.t_tid.ip_posid == \
- (i2)->bti_itup.t_tid.ip_posid )
+#define BTTidSame(i1, i2) \
+ ( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
+ (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
+ (i1).ip_posid == (i2).ip_posid )
+#define BTItemSame(i1, i2) \
+ BTTidSame((i1)->bti_itup.t_tid, (i2)->bti_itup.t_tid)
+
/*
* In general, the btree code tries to localize its knowledge about
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
+#define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
/*
* prototypes for functions in nbtree.c (external entry points for btree)
*/
-extern bool BuildingBtree; /* in nbtree.c */
-
extern void AtEOXact_nbtree(void);
extern Datum btbuild(PG_FUNCTION_ARGS);
extern Datum btmarkpos(PG_FUNCTION_ARGS);
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
+extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
/*
* prototypes for functions in nbtinsert.c
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $
+ * $Id: xlog.h,v 1.42 2003/02/22 00:45:05 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
#define XLR_INFO_MASK 0x0F
/*
- * We support backup of up to 2 disk blocks per XLOG record (could support
- * more if we cared to dedicate more xl_info bits for this purpose; currently
- * do not need more than 2 anyway). If we backed up any disk blocks then we
- * use flag bits in xl_info to signal it.
+ * If we backed up any disk blocks with the XLOG record, we use flag bits in
+ * xl_info to signal it. We support backup of up to 3 disk blocks per XLOG
+ * record. (Could support 4 if we cared to dedicate all the xl_info bits for
+ * this purpose; currently bit 0 of xl_info is unused and available.)
*/
-#define XLR_BKP_BLOCK_MASK 0x0C /* all info bits used for bkp
+#define XLR_BKP_BLOCK_MASK 0x0E /* all info bits used for bkp
* blocks */
-#define XLR_MAX_BKP_BLOCKS 2
+#define XLR_MAX_BKP_BLOCKS 3
#define XLR_SET_BKP_BLOCK(iblk) (0x08 >> (iblk))
#define XLR_BKP_BLOCK_1 XLR_SET_BKP_BLOCK(0) /* 0x08 */
#define XLR_BKP_BLOCK_2 XLR_SET_BKP_BLOCK(1) /* 0x04 */
+#define XLR_BKP_BLOCK_3 XLR_SET_BKP_BLOCK(2) /* 0x02 */
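
To illustrate the layout: XLR_SET_BKP_BLOCK(iblk) yields 0x08 >> iblk, so
backup-block slots 0, 1, 2 map to bits 0x08, 0x04, 0x02, and together they
fill XLR_BKP_BLOCK_MASK (0x0E). A sketch of how redo code might test one slot:

	/* does this record carry a full-page image in backup slot 2? */
	if (record->xl_info & XLR_SET_BKP_BLOCK(2))
	{
		/* backup block data follows the record's main data */
	}
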
/*
* Sometimes we log records which are out of transaction control.
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $
+ * $Id: catversion.h,v 1.179 2003/02/22 00:45:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 200302171
+#define CATALOG_VERSION_NO 200302211
#endif
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: pg_am.h,v 1.23 2002/07/29 22:14:11 tgl Exp $
+ * $Id: pg_am.h,v 1.24 2003/02/22 00:45:05 tgl Exp $
*
* NOTES
* the genbki.sh script reads this file and generates .bki
regproc amrestrpos; /* "restore marked scan position" function */
regproc ambuild; /* "build new index" function */
regproc ambulkdelete; /* bulk-delete function */
+ regproc amvacuumcleanup; /* post-VACUUM cleanup function */
regproc amcostestimate; /* estimate cost of an indexscan */
} FormData_pg_am;
* compiler constants for pg_am
* ----------------
*/
-#define Natts_pg_am 19
+#define Natts_pg_am 20
#define Anum_pg_am_amname 1
#define Anum_pg_am_amowner 2
#define Anum_pg_am_amstrategies 3
#define Anum_pg_am_amrestrpos 16
#define Anum_pg_am_ambuild 17
#define Anum_pg_am_ambulkdelete 18
-#define Anum_pg_am_amcostestimate 19
+#define Anum_pg_am_amvacuumcleanup 19
+#define Anum_pg_am_amcostestimate 20
/* ----------------
* initial contents of pg_am
* ----------------
*/
-DATA(insert OID = 402 ( rtree PGUID 8 3 0 f f f f rtgettuple rtinsert rtbeginscan rtrescan rtendscan rtmarkpos rtrestrpos rtbuild rtbulkdelete rtcostestimate ));
+DATA(insert OID = 402 ( rtree PGUID 8 3 0 f f f f rtgettuple rtinsert rtbeginscan rtrescan rtendscan rtmarkpos rtrestrpos rtbuild rtbulkdelete - rtcostestimate ));
DESCR("r-tree index access method");
-DATA(insert OID = 403 ( btree PGUID 5 1 1 t t t t btgettuple btinsert btbeginscan btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btcostestimate ));
+DATA(insert OID = 403 ( btree PGUID 5 1 1 t t t t btgettuple btinsert btbeginscan btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
-DATA(insert OID = 405 ( hash PGUID 1 1 0 f f f t hashgettuple hashinsert hashbeginscan hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashcostestimate ));
+DATA(insert OID = 405 ( hash PGUID 1 1 0 f f f t hashgettuple hashinsert hashbeginscan hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete - hashcostestimate ));
DESCR("hash index access method");
-DATA(insert OID = 783 ( gist PGUID 100 7 0 f t f f gistgettuple gistinsert gistbeginscan gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistcostestimate ));
+DATA(insert OID = 783 ( gist PGUID 100 7 0 f t f f gistgettuple gistinsert gistbeginscan gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete - gistcostestimate ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: pg_proc.h,v 1.283 2003/02/13 05:24:02 momjian Exp $
+ * $Id: pg_proc.h,v 1.284 2003/02/22 00:45:05 tgl Exp $
*
* NOTES
* The script catalog/genbki.sh reads this file and generates .bki
DESCR("btree(internal)");
DATA(insert OID = 332 ( btbulkdelete PGNSP PGUID 12 f f t f v 3 2281 "2281 2281 2281" btbulkdelete - _null_ ));
DESCR("btree(internal)");
+DATA(insert OID = 972 ( btvacuumcleanup PGNSP PGUID 12 f f t f v 3 2281 "2281 2281 2281" btvacuumcleanup - _null_ ));
+DESCR("btree(internal)");
DATA(insert OID = 1268 ( btcostestimate PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281" btcostestimate - _null_ ));
DESCR("btree(internal)");