More infrastructure for btree compaction project. Tree-traversal code

author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 22 Feb 2003 00:45:05 +0000 (00:45 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 22 Feb 2003 00:45:05 +0000 (00:45 +0000)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 4d38bef2370f0eb1f9d9c98863298e227a8f5e70..4b50e8f442769ba60ce540e5ab8a0fca4eae4787 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1,6 +1,6 @@
  <!--
   Documentation of the system catalogs, directed toward PostgreSQL developers
- $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.65 2003/01/19 00:13:28 momjian Exp $
+ $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.66 2003/02/22 00:45:03 tgl Exp $
   -->
  
  <chapter id="catalogs">
@@ -446,6 +446,13 @@
        <entry>bulk-delete function</entry>
       </row>
  
+     <row>
+      <entry>amvacuumcleanup</entry>
+      <entry><type>regproc</type></entry>
+      <entry>pg_proc.oid</entry>
+      <entry>post-VACUUM cleanup function</entry>
+     </row>
+
       <row>
        <entry>amcostestimate</entry>
        <entry><type>regproc</type></entry>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 6591e766448f914001a507bcc3a7537da3b58ee5..472bcf4527661ad4580dda0bf51f405cae946ba9 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -8,7 +8,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.99 2002/11/13 00:39:46 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.100 2003/02/22 00:45:03 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1650,8 +1650,9 @@ gistbulkdelete(PG_FUNCTION_ARGS)
  
         result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
         result->num_pages = num_pages;
-       result->tuples_removed = tuples_removed;
         result->num_index_tuples = num_index_tuples;
+       result->tuples_removed = tuples_removed;
+       result->pages_free = 0;
  
         PG_RETURN_POINTER(result);
  }
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 3a75265f012a0512a909a0901a6984dcd1263387..0ec2380cef0fa0ef23d011d9e1a1fb8af0aaad14 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.60 2002/09/04 20:31:09 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.61 2003/02/22 00:45:03 tgl Exp $
   *
   * NOTES
   *       This file contains only the public interface routines.
@@ -491,8 +491,9 @@ hashbulkdelete(PG_FUNCTION_ARGS)
  
         result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
         result->num_pages = num_pages;
-       result->tuples_removed = tuples_removed;
         result->num_index_tuples = num_index_tuples;
+       result->tuples_removed = tuples_removed;
+       result->pages_free = 0;
  
         PG_RETURN_POINTER(result);
  }
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 258eb546a4911d572888e4ced35f6be0cb879cdd..d045bafc1c86eaf2a1c235fd7fcb877e30673e87 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.63 2003/01/08 19:41:40 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.64 2003/02/22 00:45:03 tgl Exp $
   *
   * INTERFACE ROUTINES
   *             index_open              - open an index relation by relation OID
@@ -23,6 +23,7 @@
   *             index_restrpos  - restore a scan position
   *             index_getnext   - get the next tuple from a scan
   *             index_bulk_delete       - bulk deletion of index tuples
+ *             index_vacuum_cleanup    - post-deletion cleanup of an index
   *             index_cost_estimator    - fetch amcostestimate procedure OID
   *             index_getprocid - get a support procedure OID
   *
@@ -579,6 +580,37 @@ index_bulk_delete(Relation indexRelation,
         return result;
  }
  
+/* ----------------
+ *             index_vacuum_cleanup - do post-deletion cleanup of an index
+ *
+ *             return value is an optional palloc'd struct of statistics
+ * ----------------
+ */
+IndexBulkDeleteResult *
+index_vacuum_cleanup(Relation indexRelation,
+                                        IndexVacuumCleanupInfo *info,
+                                        IndexBulkDeleteResult *stats)
+{
+       RegProcedure procedure;
+       IndexBulkDeleteResult *result;
+
+       RELATION_CHECKS;
+
+       /* It's okay for an index AM not to have a vacuumcleanup procedure */
+       if (!RegProcedureIsValid(indexRelation->rd_am->amvacuumcleanup))
+               return stats;
+
+       GET_REL_PROCEDURE(vacuum_cleanup, amvacuumcleanup);
+
+       result = (IndexBulkDeleteResult *)
+               DatumGetPointer(OidFunctionCall3(procedure,
+                                                                                PointerGetDatum(indexRelation),
+                                                                                PointerGetDatum((Pointer) info),
+                                                                                PointerGetDatum((Pointer) stats)));
+
+       return result;
+}
+
  /* ----------------
   *             index_cost_estimator
   *
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index a93a9fed8c67b80c848cf511894d601950ac04a4..e943ca96f1ba1854c12a31949e9158bafacf6731 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.98 2003/02/22 00:45:03 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -280,12 +280,21 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
                         if (!_bt_isequal(itupdesc, page, P_HIKEY,
                                                          natts, itup_scankey))
                                 break;
-                       nblkno = opaque->btpo_next;
-                       if (nbuf != InvalidBuffer)
-                               _bt_relbuf(rel, nbuf);
-                       nbuf = _bt_getbuf(rel, nblkno, BT_READ);
-                       page = BufferGetPage(nbuf);
-                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       /* Advance to next non-dead page --- there must be one */
+                       for (;;)
+                       {
+                               nblkno = opaque->btpo_next;
+                               if (nbuf != InvalidBuffer)
+                                       _bt_relbuf(rel, nbuf);
+                               nbuf = _bt_getbuf(rel, nblkno, BT_READ);
+                               page = BufferGetPage(nbuf);
+                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                               if (!P_IGNORE(opaque))
+                                       break;
+                               if (P_RIGHTMOST(opaque))
+                                       elog(ERROR, "_bt_check_unique: fell off the end of %s",
+                                                RelationGetRelationName(rel));
+                       }
                         maxoff = PageGetMaxOffsetNumber(page);
                         offset = P_FIRSTDATAKEY(opaque);
                 }
@@ -414,20 +423,34 @@ _bt_insertonpg(Relation rel,
                            _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
                            random() > (MAX_RANDOM_VALUE / 100))
                 {
-                       /* step right one page */
-                       BlockNumber rblkno = lpageop->btpo_next;
-                       Buffer          rbuf;
-
                         /*
-                        * must write-lock next page before releasing write lock on
+                        * step right to next non-dead page
+                        *
+                        * must write-lock that page before releasing write lock on
                          * current page; else someone else's _bt_check_unique scan
-                        * could fail to see our insertion.
+                        * could fail to see our insertion.  write locks on intermediate
+                        * dead pages won't do because we don't know when they will get
+                        * de-linked from the tree.
                          */
-                       rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
+                       Buffer          rbuf = InvalidBuffer;
+
+                       for (;;)
+                       {
+                               BlockNumber rblkno = lpageop->btpo_next;
+
+                               if (rbuf != InvalidBuffer)
+                                       _bt_relbuf(rel, rbuf);
+                               rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
+                               page = BufferGetPage(rbuf);
+                               lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+                               if (!P_IGNORE(lpageop))
+                                       break;
+                               if (P_RIGHTMOST(lpageop))
+                                       elog(ERROR, "_bt_insertonpg: fell off the end of %s",
+                                                RelationGetRelationName(rel));
+                       }
                         _bt_relbuf(rel, buf);
                         buf = rbuf;
-                       page = BufferGetPage(buf);
-                       lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
                         movedright = true;
                 }
  
@@ -633,8 +656,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
         BTPageOpaque ropaque,
                                 lopaque,
                                 oopaque;
-       Buffer          sbuf = 0;
-       Page            spage = 0;
+       Buffer          sbuf = InvalidBuffer;
+       Page            spage = NULL;
+       BTPageOpaque sopaque = NULL;
         Size            itemsz;
         ItemId          itemid;
         BTItem          item;
@@ -792,6 +816,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
         {
                 sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
                 spage = BufferGetPage(sbuf);
+               sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+               if (sopaque->btpo_prev != ropaque->btpo_prev)
+                       elog(PANIC, "btree: right sibling's left-link doesn't match");
         }
  
         /*
@@ -802,6 +829,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
          */
         START_CRIT_SECTION();
  
+       if (!P_RIGHTMOST(ropaque))
+               sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
         /* XLOG stuff */
         if (!rel->rd_istemp)
         {
@@ -847,10 +877,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
  
                 if (!P_RIGHTMOST(ropaque))
                 {
-                       BTPageOpaque sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
-
-                       sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
-
                         rdata[2].next = &(rdata[3]);
                         rdata[3].buffer = sbuf;
                         rdata[3].data = NULL;
@@ -1250,58 +1276,63 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
                 Buffer          buf;
                 Page            page;
                 BTPageOpaque opaque;
-               OffsetNumber offnum,
-                                       minoff,
-                                       maxoff;
-               ItemId          itemid;
-               BTItem          item;
  
                 buf = _bt_getbuf(rel, blkno, access);
                 page = BufferGetPage(buf);
                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-               minoff = P_FIRSTDATAKEY(opaque);
-               maxoff = PageGetMaxOffsetNumber(page);
  
-               /*
-                * start = InvalidOffsetNumber means "search the whole page".
-                * We need this test anyway due to possibility that
-                * page has a high key now when it didn't before.
-                */
-               if (start < minoff)
-                       start = minoff;
-
-               /*
-                * These loops will check every item on the page --- but in an order
-                * that's attuned to the probability of where it actually is.  Scan
-                * to the right first, then to the left.
-                */
-               for (offnum = start;
-                        offnum <= maxoff;
-                        offnum = OffsetNumberNext(offnum))
+               if (!P_IGNORE(opaque))
                 {
-                       itemid = PageGetItemId(page, offnum);
-                       item = (BTItem) PageGetItem(page, itemid);
-                       if (BTItemSame(item, &stack->bts_btitem))
+                       OffsetNumber offnum,
+                                               minoff,
+                                               maxoff;
+                       ItemId          itemid;
+                       BTItem          item;
+
+                       minoff = P_FIRSTDATAKEY(opaque);
+                       maxoff = PageGetMaxOffsetNumber(page);
+
+                       /*
+                        * start = InvalidOffsetNumber means "search the whole page".
+                        * We need this test anyway due to possibility that
+                        * page has a high key now when it didn't before.
+                        */
+                       if (start < minoff)
+                               start = minoff;
+
+                       /*
+                        * These loops will check every item on the page --- but in an
+                        * order that's attuned to the probability of where it actually
+                        * is.  Scan to the right first, then to the left.
+                        */
+                       for (offnum = start;
+                                offnum <= maxoff;
+                                offnum = OffsetNumberNext(offnum))
                         {
-                               /* Return accurate pointer to where link is now */
-                               stack->bts_blkno = blkno;
-                               stack->bts_offset = offnum;
-                               return buf;
+                               itemid = PageGetItemId(page, offnum);
+                               item = (BTItem) PageGetItem(page, itemid);
+                               if (BTItemSame(item, &stack->bts_btitem))
+                               {
+                                       /* Return accurate pointer to where link is now */
+                                       stack->bts_blkno = blkno;
+                                       stack->bts_offset = offnum;
+                                       return buf;
+                               }
                         }
-               }
  
-               for (offnum = OffsetNumberPrev(start);
-                        offnum >= minoff;
-                        offnum = OffsetNumberPrev(offnum))
-               {
-                       itemid = PageGetItemId(page, offnum);
-                       item = (BTItem) PageGetItem(page, itemid);
-                       if (BTItemSame(item, &stack->bts_btitem))
+                       for (offnum = OffsetNumberPrev(start);
+                                offnum >= minoff;
+                                offnum = OffsetNumberPrev(offnum))
                         {
-                               /* Return accurate pointer to where link is now */
-                               stack->bts_blkno = blkno;
-                               stack->bts_offset = offnum;
-                               return buf;
+                               itemid = PageGetItemId(page, offnum);
+                               item = (BTItem) PageGetItem(page, itemid);
+                               if (BTItemSame(item, &stack->bts_btitem))
+                               {
+                                       /* Return accurate pointer to where link is now */
+                                       stack->bts_blkno = blkno;
+                                       stack->bts_offset = offnum;
+                                       return buf;
+                               }
                         }
                 }
  
@@ -1365,6 +1396,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
         rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
         rootpage = BufferGetPage(rootbuf);
         rootblknum = BufferGetBlockNumber(rootbuf);
+
+       /* acquire lock on the metapage */
         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
         metapg = BufferGetPage(metabuf);
         metad = BTPageGetMeta(metapg);
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index c9879b73ae601623549ce2c5a90a9a7d006a8686..0296b71c3633b6f46e12d9b236b3725af3dccd48 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.60 2003/02/22 00:45:04 tgl Exp $
   *
   *     NOTES
   *        Postgres btree pages look like ordinary relation pages.      The opaque
@@ -22,34 +22,17 @@
   */
  #include "postgres.h"
  
-#include <time.h>
-
  #include "access/nbtree.h"
  #include "miscadmin.h"
  #include "storage/lmgr.h"
  
-extern bool FixBTree;                  /* comments in nbtree.c */
-extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
-
-/*
- *     We use high-concurrency locking on btrees.      There are two cases in
- *     which we don't do locking.  One is when we're building the btree.
- *     Since the creating transaction has not committed, no one can see
- *     the index, and there's no reason to share locks.  The second case
- *     is when we're just starting up the database system.  We use some
- *     special-purpose initialization code in the relation cache manager
- *     (see utils/cache/relcache.c) to allow us to do indexed scans on
- *     the system catalogs before we'd normally be able to.  This happens
- *     before the lock table is fully initialized, so we can't use it.
- *     Strictly speaking, this violates 2pl, but we don't do 2pl on the
- *     system catalogs anyway, so I declare this to be okay.
- */
-
-#define USELOCKING             (!BuildingBtree && !IsInitProcessingMode())
-
  
  /*
   *     _bt_metapinit() -- Initialize the metadata page of a new btree.
+ *
+ * Note: there's no real need for any locking here.  Since the transaction
+ * creating the index hasn't committed yet, no one else can even see the index
+ * much less be trying to use it.
   */
  void
  _bt_metapinit(Relation rel)
@@ -59,10 +42,6 @@ _bt_metapinit(Relation rel)
         BTMetaPageData *metad;
         BTPageOpaque op;
  
-       /* can't be sharing this with anyone, now... */
-       if (USELOCKING)
-               LockRelation(rel, AccessExclusiveLock);
-
         if (RelationGetNumberOfBlocks(rel) != 0)
                 elog(ERROR, "Cannot initialize non-empty btree %s",
                          RelationGetRelationName(rel));
@@ -114,10 +93,6 @@ _bt_metapinit(Relation rel)
         END_CRIT_SECTION();
  
         WriteBuffer(buf);
-
-       /* all done */
-       if (USELOCKING)
-               UnlockRelation(rel, AccessExclusiveLock);
  }
  
  /*
@@ -142,7 +117,8 @@ _bt_metapinit(Relation rel)
   *             what we will return is the old root, which is now just the leftmost
   *             page on a probably-not-very-wide level.  For most purposes this is
   *             as good as or better than the true root, so we do not bother to
- *             insist on finding the true root.
+ *             insist on finding the true root.  We do, however, guarantee to
+ *             return a live (not deleted or half-dead) page.
   *
   *             On successful return, the root page is pinned and read-locked.
   *             The metadata page is not locked or pinned on exit.
@@ -157,6 +133,7 @@ _bt_getroot(Relation rel, int access)
         Page            rootpage;
         BTPageOpaque rootopaque;
         BlockNumber rootblkno;
+       uint32          rootlevel;
         BTMetaPageData *metad;
  
         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
@@ -164,6 +141,7 @@ _bt_getroot(Relation rel, int access)
         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
         metad = BTPageGetMeta(metapg);
  
+       /* sanity-check the metapage */
         if (!(metaopaque->btpo_flags & BTP_META) ||
                 metad->btm_magic != BTREE_MAGIC)
                 elog(ERROR, "Index %s is not a btree",
@@ -191,90 +169,113 @@ _bt_getroot(Relation rel, int access)
                 /*
                  * Race condition:      if someone else initialized the metadata
                  * between the time we released the read lock and acquired the
-                * write lock, above, we must avoid doing it again.
+                * write lock, we must avoid doing it again.
                  */
-               if (metad->btm_root == P_NONE)
+               if (metad->btm_root != P_NONE)
                 {
                         /*
-                        * Get, initialize, write, and leave a lock of the appropriate
-                        * type on the new root page.  Since this is the first page in
-                        * the tree, it's a leaf as well as the root.
+                        * Metadata initialized by someone else.  In order to
+                        * guarantee no deadlocks, we have to release the metadata
+                        * page and start all over again.  (Is that really true?
+                        * But it's hardly worth trying to optimize this case.)
                          */
-                       rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
-                       rootblkno = BufferGetBlockNumber(rootbuf);
-                       rootpage = BufferGetPage(rootbuf);
-
-                       _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
-                       rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
-                       rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
-                       rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
-                       rootopaque->btpo.level = 0;
-
-                       /* NO ELOG(ERROR) till meta is updated */
-                       START_CRIT_SECTION();
-
-                       metad->btm_root = rootblkno;
-                       metad->btm_level = 0;
-                       metad->btm_fastroot = rootblkno;
-                       metad->btm_fastlevel = 0;
+                       _bt_relbuf(rel, metabuf);
+                       return _bt_getroot(rel, access);
+               }
  
-                       /* XLOG stuff */
-                       if (!rel->rd_istemp)
-                       {
-                               xl_btree_newroot xlrec;
-                               XLogRecPtr      recptr;
-                               XLogRecData rdata;
+               /*
+                * Get, initialize, write, and leave a lock of the appropriate
+                * type on the new root page.  Since this is the first page in
+                * the tree, it's a leaf as well as the root.
+                */
+               rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+               rootblkno = BufferGetBlockNumber(rootbuf);
+               rootpage = BufferGetPage(rootbuf);
+
+               _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+               rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+               rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+               rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+               rootopaque->btpo.level = 0;
+
+               /* NO ELOG(ERROR) till meta is updated */
+               START_CRIT_SECTION();
+
+               metad->btm_root = rootblkno;
+               metad->btm_level = 0;
+               metad->btm_fastroot = rootblkno;
+               metad->btm_fastlevel = 0;
+
+               /* XLOG stuff */
+               if (!rel->rd_istemp)
+               {
+                       xl_btree_newroot xlrec;
+                       XLogRecPtr      recptr;
+                       XLogRecData rdata;
  
-                               xlrec.node = rel->rd_node;
-                               xlrec.rootblk = rootblkno;
-                               xlrec.level = 0;
+                       xlrec.node = rel->rd_node;
+                       xlrec.rootblk = rootblkno;
+                       xlrec.level = 0;
  
-                               rdata.buffer = InvalidBuffer;
-                               rdata.data = (char *) &xlrec;
-                               rdata.len = SizeOfBtreeNewroot;
-                               rdata.next = NULL;
+                       rdata.buffer = InvalidBuffer;
+                       rdata.data = (char *) &xlrec;
+                       rdata.len = SizeOfBtreeNewroot;
+                       rdata.next = NULL;
  
-                               recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
+                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
  
-                               PageSetLSN(rootpage, recptr);
-                               PageSetSUI(rootpage, ThisStartUpID);
-                               PageSetLSN(metapg, recptr);
-                               PageSetSUI(metapg, ThisStartUpID);
-                       }
+                       PageSetLSN(rootpage, recptr);
+                       PageSetSUI(rootpage, ThisStartUpID);
+                       PageSetLSN(metapg, recptr);
+                       PageSetSUI(metapg, ThisStartUpID);
+               }
  
-                       END_CRIT_SECTION();
+               END_CRIT_SECTION();
  
-                       _bt_wrtnorelbuf(rel, rootbuf);
+               _bt_wrtnorelbuf(rel, rootbuf);
  
-                       /*
-                        * swap root write lock for read lock.  There is no danger of
-                        * anyone else accessing the new root page while it's unlocked,
-                        * since no one else knows where it is yet.
-                        */
-                       LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-                       LockBuffer(rootbuf, BT_READ);
+               /*
+                * swap root write lock for read lock.  There is no danger of
+                * anyone else accessing the new root page while it's unlocked,
+                * since no one else knows where it is yet.
+                */
+               LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
+               LockBuffer(rootbuf, BT_READ);
  
-                       /* okay, metadata is correct, write and release it */
-                       _bt_wrtbuf(rel, metabuf);
-               }
-               else
-               {
-                       /*
-                        * Metadata initialized by someone else.  In order to
-                        * guarantee no deadlocks, we have to release the metadata
-                        * page and start all over again.
-                        */
-                       _bt_relbuf(rel, metabuf);
-                       return _bt_getroot(rel, access);
-               }
+               /* okay, metadata is correct, write and release it */
+               _bt_wrtbuf(rel, metabuf);
         }
         else
         {
                 rootblkno = metad->btm_fastroot;
+               Assert(rootblkno != P_NONE);
+               rootlevel = metad->btm_fastlevel;
  
                 _bt_relbuf(rel, metabuf);               /* done with the meta page */
  
-               rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+               for (;;)
+               {
+                       rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+                       rootpage = BufferGetPage(rootbuf);
+                       rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+                       if (!P_IGNORE(rootopaque))
+                               break;
+
+                       /* it's dead, Jim.  step right one page */
+                       if (P_RIGHTMOST(rootopaque))
+                               elog(ERROR, "No live root page found in %s",
+                                        RelationGetRelationName(rel));
+                       rootblkno = rootopaque->btpo_next;
+
+                       _bt_relbuf(rel, rootbuf);
+               }
+
+               /* Note: can't check btpo.level on deleted pages */
+               if (rootopaque->btpo.level != rootlevel)
+                       elog(ERROR, "Root page %u of %s has level %u, expected %u",
+                                rootblkno, RelationGetRelationName(rel),
+                                rootopaque->btpo.level, rootlevel);
         }
  
         /*
@@ -305,7 +306,10 @@ _bt_gettrueroot(Relation rel)
         Page            metapg;
         BTPageOpaque metaopaque;
         Buffer          rootbuf;
+       Page            rootpage;
+       BTPageOpaque rootopaque;
         BlockNumber rootblkno;
+       uint32          rootlevel;
         BTMetaPageData *metad;
  
         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
@@ -331,10 +335,33 @@ _bt_gettrueroot(Relation rel)
         }
  
         rootblkno = metad->btm_root;
+       rootlevel = metad->btm_level;
  
         _bt_relbuf(rel, metabuf);       /* done with the meta page */
  
-       rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+       for (;;)
+       {
+               rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+               rootpage = BufferGetPage(rootbuf);
+               rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+               if (!P_IGNORE(rootopaque))
+                       break;
+
+               /* it's dead, Jim.  step right one page */
+               if (P_RIGHTMOST(rootopaque))
+                       elog(ERROR, "No live root page found in %s",
+                                RelationGetRelationName(rel));
+               rootblkno = rootopaque->btpo_next;
+
+               _bt_relbuf(rel, rootbuf);
+       }
+
+       /* Note: can't check btpo.level on deleted pages */
+       if (rootopaque->btpo.level != rootlevel)
+               elog(ERROR, "Root page %u of %s has level %u, expected %u",
+                        rootblkno, RelationGetRelationName(rel),
+                        rootopaque->btpo.level, rootlevel);
  
         return rootbuf;
  }
@@ -342,6 +369,8 @@ _bt_gettrueroot(Relation rel)
  /*
   *     _bt_getbuf() -- Get a buffer by block number for read or write.
   *
+ *             blkno == P_NEW means to get an unallocated index page.
+ *
   *             When this routine returns, the appropriate lock is set on the
   *             requested buffer and its reference count has been incremented
   *             (ie, the buffer is "locked and pinned").
@@ -359,18 +388,35 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
         }
         else
         {
+               bool            needLock;
                 Page            page;
  
+               /* XXX soon: ask FSM about free space */
+
                 /*
                  * Extend the relation by one page.
                  *
-                * Extend bufmgr code is unclean and so we have to use extra locking
-                * here.
+                * We have to use a lock to ensure no one else is extending the rel at
+                * the same time, else we will both try to initialize the same new
+                * page.  We can skip locking for new or temp relations, however,
+                * since no one else could be accessing them.
                  */
-               LockPage(rel, 0, ExclusiveLock);
-               buf = ReadBuffer(rel, blkno);
+               needLock = !(rel->rd_isnew || rel->rd_istemp);
+
+               if (needLock)
+                       LockPage(rel, 0, ExclusiveLock);
+
+               buf = ReadBuffer(rel, P_NEW);
+
+               /*
+                * Release the file-extension lock; it's now OK for someone else to
+                * extend the relation some more.
+                */
+               if (needLock)
+                       UnlockPage(rel, 0, ExclusiveLock);
+
+               /* Acquire appropriate buffer lock on new page */
                 LockBuffer(buf, access);
-               UnlockPage(rel, 0, ExclusiveLock);
  
                 /* Initialize the new page before returning it */
                 page = BufferGetPage(buf);
@@ -403,10 +449,9 @@ _bt_relbuf(Relation rel, Buffer buf)
   *             and a pin on the buffer.
   *
   * NOTE: actually, the buffer manager just marks the shared buffer page
- * dirty here, the real I/O happens later.     Since we can't persuade the
- * Unix kernel to schedule disk writes in a particular order, there's not
- * much point in worrying about this.  The most we can say is that all the
- * writes will occur before commit.
+ * dirty here; the real I/O happens later.  This is okay since we are not
+ * relying on write ordering anyway.  The WAL mechanism is responsible for
+ * guaranteeing correctness after a crash.
   */
  void
  _bt_wrtbuf(Relation rel, Buffer buf)
@@ -455,8 +500,9 @@ _bt_pageinit(Page page, Size size)
   *             mistake.  On exit, metapage data is correct and we no longer have
   *             a pin or lock on the metapage.
   *
- * XXX this is not used for splitting anymore, only in nbtsort.c at the
- * completion of btree building.
+ * Actually this is not used for splitting on-the-fly anymore.  It's only used
+ * in nbtsort.c at the completion of btree building, where we know we have
+ * sole access to the index anyway.
   */
  void
  _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
@@ -512,6 +558,10 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
  /*
   * Delete an item from a btree page.
   *
+ * This must only be used for deleting leaf items.  Deleting an item on a
+ * non-leaf page has to be done as part of an atomic action that includes
+ * deleting the page it points to.
+ *
   * This routine assumes that the caller has pinned and locked the buffer,
   * and will write the buffer afterwards.
   */
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index de6765415fefb9d6e492d0a09e3c2cb4dec60401..c7f23da4c7a8ab927cbfdab71493e7212fee34d4 100644 (file)
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.96 2003/02/22 00:45:04 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -23,6 +23,7 @@
  #include "access/nbtree.h"
  #include "catalog/index.h"
  #include "miscadmin.h"
+#include "storage/freespace.h"
  
  
  /* Working state for btbuild and its callback */
@@ -44,7 +45,6 @@ typedef struct
  } BTBuildState;
  
  
-bool           BuildingBtree = false;          /* see comment in btbuild() */
  bool           FastBuild = true;       /* use SORT instead of insertion build */
  
  /*
@@ -68,13 +68,7 @@ static void btbuildCallback(Relation index,
  void
  AtEOXact_nbtree(void)
  {
-       /*
-        * Note: these actions should only be necessary during xact abort; but
-        * they can't hurt during a commit.
-        */
-
-       /* If we were building a btree, we ain't anymore. */
-       BuildingBtree = false;
+       /* nothing to do at the moment */
  }
  
  
@@ -95,9 +89,6 @@ btbuild(PG_FUNCTION_ARGS)
         double          reltuples;
         BTBuildState buildstate;
  
-       /* set flag to disable locking */
-       BuildingBtree = true;
-
         /*
          * bootstrap processing does something strange, so don't use
          * sort/build for initial catalog indices.      at some point i need to
@@ -172,9 +163,6 @@ btbuild(PG_FUNCTION_ARGS)
         }
  #endif   /* BTREE_BUILD_STATS */
  
-       /* all done */
-       BuildingBtree = false;
-
         /*
          * Since we just counted the tuples in the heap, we update its stats
          * in pg_class to guarantee that the planner takes advantage of the
@@ -689,10 +677,6 @@ btbulkdelete(PG_FUNCTION_ARGS)
                                  * We now need to back up the scan one item, so that the next
                                  * cycle will re-examine the same offnum on this page (which
                                  * now holds the next item).
-                                *
-                                * For now, just hack the current-item index.  Will need to
-                                * be smarter when deletion includes removal of empty
-                                * index pages.
                                  */
                                 current->ip_posid--;
                         }
@@ -708,12 +692,89 @@ btbulkdelete(PG_FUNCTION_ARGS)
  
         result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
         result->num_pages = num_pages;
-       result->tuples_removed = tuples_removed;
         result->num_index_tuples = num_index_tuples;
+       result->tuples_removed = tuples_removed;
+       result->pages_free = 0;         /* not computed here */
  
         PG_RETURN_POINTER(result);
  }
  
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Here, we scan looking for pages we can delete or return to the freelist.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+Datum
+btvacuumcleanup(PG_FUNCTION_ARGS)
+{
+       Relation        rel = (Relation) PG_GETARG_POINTER(0);
+#ifdef NOT_USED
+       IndexVacuumCleanupInfo *info = (IndexVacuumCleanupInfo *) PG_GETARG_POINTER(1);
+#endif
+       IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(2);
+       BlockNumber num_pages;
+       BlockNumber blkno;
+       PageFreeSpaceInfo *pageSpaces;
+       int                     nFreePages,
+                               maxFreePages;
+
+       Assert(stats != NULL);
+
+       num_pages = RelationGetNumberOfBlocks(rel);
+
+       /* No point in remembering more than MaxFSMPages pages */
+       maxFreePages = MaxFSMPages;
+       if ((BlockNumber) maxFreePages > num_pages)
+               maxFreePages = (int) num_pages + 1;     /* +1 to avoid palloc(0) */
+       pageSpaces = (PageFreeSpaceInfo *) palloc(maxFreePages * sizeof(PageFreeSpaceInfo));
+       nFreePages = 0;
+
+       /*
+        * Scan through all pages of index, except metapage.  (Any pages added
+        * after we start the scan will not be examined; this should be fine,
+        * since they can't possibly be empty.)
+        */
+       for (blkno = BTREE_METAPAGE+1; blkno < num_pages; blkno++)
+       {
+               Buffer  buf;
+               Page    page;
+               BTPageOpaque opaque;
+
+               buf = _bt_getbuf(rel, blkno, BT_READ);
+               page = BufferGetPage(buf);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               if (P_ISDELETED(opaque))
+               {
+                       /* XXX if safe-to-reclaim... */
+                       if (nFreePages < maxFreePages)
+                       {
+                               pageSpaces[nFreePages].blkno = blkno;
+                               /* The avail-space value is bogus, but must be < BLCKSZ */
+                               pageSpaces[nFreePages].avail = BLCKSZ-1;
+                               nFreePages++;
+                       }
+               }
+               _bt_relbuf(rel, buf);
+       }
+
+       /*
+        * Update the shared Free Space Map with the info we now have about
+        * free space in the index, discarding any old info the map may have.
+        * We do not need to sort the page numbers; they're in order already.
+        */
+       MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces);
+
+       pfree(pageSpaces);
+
+       /* update statistics */
+       stats->num_pages = num_pages;
+       stats->pages_free = nFreePages;
+
+       PG_RETURN_POINTER(stats);
+}
+
  /*
   * Restore scan position when btgettuple is called to continue a scan.
   *
@@ -739,7 +800,7 @@ _bt_restscan(IndexScanDesc scan)
                                 maxoff;
         BTPageOpaque opaque;
         Buffer          nextbuf;
-       ItemPointerData target = so->curHeapIptr;
+       ItemPointer target = &(so->curHeapIptr);
         BTItem          item;
         BlockNumber blkno;
  
@@ -759,7 +820,7 @@ _bt_restscan(IndexScanDesc scan)
          * current->ip_posid before first index tuple on the current page
          * (_bt_step will move it right)...  XXX still needed?
          */
-       if (!ItemPointerIsValid(&target))
+       if (!ItemPointerIsValid(target))
         {
                 ItemPointerSetOffsetNumber(current,
                                                            OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
@@ -778,11 +839,7 @@ _bt_restscan(IndexScanDesc scan)
                          offnum = OffsetNumberNext(offnum))
                 {
                         item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-                       if (item->bti_itup.t_tid.ip_blkid.bi_hi ==
-                               target.ip_blkid.bi_hi &&
-                               item->bti_itup.t_tid.ip_blkid.bi_lo ==
-                               target.ip_blkid.bi_lo &&
-                               item->bti_itup.t_tid.ip_posid == target.ip_posid)
+                       if (BTTidSame(item->bti_itup.t_tid, *target))
                         {
                                 /* Found it */
                                 current->ip_posid = offnum;
@@ -793,22 +850,33 @@ _bt_restscan(IndexScanDesc scan)
                 /*
                  * The item we're looking for moved right at least one page, so
                  * move right.  We are careful here to pin and read-lock the next
-                * page before releasing the current one.  This ensures that a
-                * concurrent btbulkdelete scan cannot pass our position --- if it
+                * non-dead page before releasing the current one.  This ensures that
+                * a concurrent btbulkdelete scan cannot pass our position --- if it
                  * did, it might be able to reach and delete our target item before
                  * we can find it again.
                  */
                 if (P_RIGHTMOST(opaque))
-                       elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
+                       elog(ERROR, "_bt_restscan: my bits moved right off the end of the world!"
                                  "\n\tRecreate index %s.", RelationGetRelationName(rel));
-
-               blkno = opaque->btpo_next;
-               nextbuf = _bt_getbuf(rel, blkno, BT_READ);
+               /* Advance to next non-dead page --- there must be one */
+               nextbuf = InvalidBuffer;
+               for (;;)
+               {
+                       blkno = opaque->btpo_next;
+                       if (nextbuf != InvalidBuffer)
+                               _bt_relbuf(rel, nextbuf);
+                       nextbuf = _bt_getbuf(rel, blkno, BT_READ);
+                       page = BufferGetPage(nextbuf);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       if (!P_IGNORE(opaque))
+                               break;
+                       if (P_RIGHTMOST(opaque))
+                               elog(ERROR, "_bt_restscan: fell off the end of %s",
+                                        RelationGetRelationName(rel));
+               }
                 _bt_relbuf(rel, buf);
                 so->btso_curbuf = buf = nextbuf;
-               page = BufferGetPage(buf);
                 maxoff = PageGetMaxOffsetNumber(page);
-               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                 offnum = P_FIRSTDATAKEY(opaque);
                 ItemPointerSet(current, blkno, offnum);
         }
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c

index 0daae3cd58686d12784074686997785f9035dfb9..91089d854545461e923da7de140b8071dfc8fdb3 100644 (file)
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1,14 +1,14 @@
  /*-------------------------------------------------------------------------
   *
   * nbtsearch.c
- *       search code for postgres btrees.
+ *       Search code for postgres btrees.
   *
   *
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.74 2003/02/22 00:45:04 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -19,6 +19,7 @@
  #include "access/nbtree.h"
  
  
+static Buffer _bt_walk_left(Relation rel, Buffer buf);
  static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  
  
@@ -79,10 +80,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
                 par_blkno = BufferGetBlockNumber(*bufP);
  
                 /*
-                * We need to save the bit image of the index entry we chose in
+                * We need to save the location of the index entry we chose in
                  * the parent page on a stack. In case we split the tree, we'll
-                * use this bit image to figure out what our real parent page is,
-                * in case the parent splits while we're working lower in the
+                * use the stack to work back up to the parent page.  We also save
+                * the actual downlink (TID) to uniquely identify the index entry,
+                * in case it moves right while we're working lower in the
                  * tree.  See the paper by Lehman and Yao for how this is detected
                  * and handled. (We use the child link to disambiguate duplicate
                  * keys in the index -- Lehman and Yao disallow duplicate keys.)
@@ -114,7 +116,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
  /*
   *     _bt_moveright() -- move right in the btree if necessary.
   *
- *             When we drop and reacquire a pointer to a page, it is possible that
+ *             When we follow a pointer to reach a page, it is possible that
   *             the page has changed in the meanwhile.  If this happens, we're
   *             guaranteed that the page has "split right" -- that is, that any
   *             data that appeared on the page originally is either on the page
@@ -148,9 +150,13 @@ _bt_moveright(Relation rel,
          * right.  (If the scan key is equal to the high key, we might or
          * might not need to move right; have to scan the page first anyway.)
          * It could even have split more than once, so scan as far as needed.
+        *
+        * We also have to move right if we followed a link that brought us to
+        * a dead page.
          */
         while (!P_RIGHTMOST(opaque) &&
-                  _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
+                  (P_IGNORE(opaque) ||
+                       _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0))
         {
                 /* step right one page */
                 BlockNumber rblkno = opaque->btpo_next;
@@ -161,6 +167,10 @@ _bt_moveright(Relation rel,
                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
         }
  
+       if (P_IGNORE(opaque))
+               elog(ERROR, "_bt_moveright: fell off the end of %s",
+                        RelationGetRelationName(rel));
+
         return buf;
  }
  
@@ -796,7 +806,6 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
         OffsetNumber offnum,
                                 maxoff;
         BlockNumber blkno;
-       BlockNumber obknum;
  
         /*
          * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
@@ -814,7 +823,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                         offnum = OffsetNumberNext(offnum);
                 else
                 {
-                       /* walk right to the next page with data */
+                       /* Walk right to the next page with data */
                         for (;;)
                         {
                                 /* if we're at end of scan, release the buffer and return */
@@ -831,58 +840,56 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                 *bufP = _bt_getbuf(rel, blkno, BT_READ);
                                 page = BufferGetPage(*bufP);
                                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                               maxoff = PageGetMaxOffsetNumber(page);
-                               /* done if it's not empty */
-                               offnum = P_FIRSTDATAKEY(opaque);
-                               if (!PageIsEmpty(page) && offnum <= maxoff)
-                                       break;
+                               if (!P_IGNORE(opaque))
+                               {
+                                       maxoff = PageGetMaxOffsetNumber(page);
+                                       /* done if it's not empty */
+                                       offnum = P_FIRSTDATAKEY(opaque);
+                                       if (!PageIsEmpty(page) && offnum <= maxoff)
+                                               break;
+                               }
                         }
                 }
         }
-       else
+       else                                            /* backwards scan */
         {
                 if (offnum > P_FIRSTDATAKEY(opaque))
                         offnum = OffsetNumberPrev(offnum);
                 else
                 {
-                       /* walk left to the next page with data */
+                       /*
+                        * Walk left to the next page with data.  This is much more
+                        * complex than the walk-right case because of the possibility
+                        * that the page to our left splits while we are in flight to it,
+                        * plus the possibility that the page we were on gets deleted
+                        * after we leave it.  See nbtree/README for details.
+                        */
                         for (;;)
                         {
-                               /* if we're at end of scan, release the buffer and return */
-                               if (P_LEFTMOST(opaque))
+                               *bufP = _bt_walk_left(rel, *bufP);
+
+                               /* if we're at end of scan, return failure */
+                               if (*bufP == InvalidBuffer)
                                 {
-                                       _bt_relbuf(rel, *bufP);
                                         ItemPointerSetInvalid(current);
-                                       *bufP = so->btso_curbuf = InvalidBuffer;
+                                       so->btso_curbuf = InvalidBuffer;
                                         return false;
                                 }
-                               /* step left */
-                               obknum = BufferGetBlockNumber(*bufP);
-                               blkno = opaque->btpo_prev;
-                               _bt_relbuf(rel, *bufP);
-                               *bufP = _bt_getbuf(rel, blkno, BT_READ);
                                 page = BufferGetPage(*bufP);
                                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
                                 /*
-                                * If the adjacent page just split, then we have to walk
-                                * right to find the block that's now adjacent to where we
-                                * were.  Because pages only split right, we don't have to
-                                * worry about this failing to terminate.
+                                * Okay, we managed to move left to a non-deleted page.
+                                * Done if it's not half-dead and not empty.  Else loop back
+                                * and do it all again.
                                  */
-                               while (opaque->btpo_next != obknum)
+                               if (!P_IGNORE(opaque))
                                 {
-                                       blkno = opaque->btpo_next;
-                                       _bt_relbuf(rel, *bufP);
-                                       *bufP = _bt_getbuf(rel, blkno, BT_READ);
-                                       page = BufferGetPage(*bufP);
-                                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                                       maxoff = PageGetMaxOffsetNumber(page);
+                                       offnum = maxoff;
+                                       if (!PageIsEmpty(page) &&
+                                               maxoff >= P_FIRSTDATAKEY(opaque))
+                                               break;
                                 }
-                               /* done if it's not empty */
-                               maxoff = PageGetMaxOffsetNumber(page);
-                               offnum = maxoff;
-                               if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
-                                       break;
                         }
                 }
         }
@@ -895,11 +902,133 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
         return true;
  }
  
+/*
+ * _bt_walk_left() -- step left one page, if possible
+ *
+ * The given buffer must be pinned and read-locked.  That pin and lock are
+ * released before stepping left.  On return, we hold a pin and read lock on
+ * the returned page instead.
+ *
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
+ *
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
+ */
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf)
+{
+       Page            page;
+       BTPageOpaque opaque;
+
+       page = BufferGetPage(buf);
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+       for (;;)
+       {
+               BlockNumber obknum;
+               BlockNumber lblkno;
+               BlockNumber blkno;
+               int                     tries;
+
+               /* if we're at end of tree, release buf and return failure */
+               if (P_LEFTMOST(opaque))
+               {
+                       _bt_relbuf(rel, buf);
+                       break;
+               }
+               /* remember original page we are stepping left from */
+               obknum = BufferGetBlockNumber(buf);
+               /* step left */
+               blkno = lblkno = opaque->btpo_prev;
+               _bt_relbuf(rel, buf);
+               buf = _bt_getbuf(rel, blkno, BT_READ);
+               page = BufferGetPage(buf);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               /*
+                * If this isn't the page we want, walk right till we find
+                * what we want --- but go no more than four hops (an
+                * arbitrary limit).  If we don't find the correct page by then,
+                * the most likely bet is that the original page got deleted
+                * and isn't in the sibling chain at all anymore, not that its
+                * left sibling got split more than four times.
+                *
+                * Note that it is correct to test P_ISDELETED not P_IGNORE
+                * here, because half-dead pages are still in the sibling
+                * chain.  Caller must reject half-dead pages if wanted.
+                */
+               tries = 0;
+               for (;;)
+               {
+                       if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+                       {
+                               /* Found desired page, return it */
+                               return buf;
+                       }
+                       if (P_RIGHTMOST(opaque) || ++tries > 4)
+                               break;
+                       blkno = opaque->btpo_next;
+                       _bt_relbuf(rel, buf);
+                       buf = _bt_getbuf(rel, blkno, BT_READ);
+                       page = BufferGetPage(buf);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               }
+
+               /* Return to the original page to see what's up */
+               _bt_relbuf(rel, buf);
+               buf = _bt_getbuf(rel, obknum, BT_READ);
+               page = BufferGetPage(buf);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               if (P_ISDELETED(opaque))
+               {
+                       /*
+                        * It was deleted.  Move right to first nondeleted page
+                        * (there must be one); that is the page that has acquired the
+                        * deleted one's keyspace, so stepping left from it will take
+                        * us where we want to be.
+                        */
+                       for (;;)
+                       {
+                               if (P_RIGHTMOST(opaque))
+                                       elog(ERROR, "_bt_walk_left: fell off the end of %s",
+                                                RelationGetRelationName(rel));
+                               blkno = opaque->btpo_next;
+                               _bt_relbuf(rel, buf);
+                               buf = _bt_getbuf(rel, blkno, BT_READ);
+                               page = BufferGetPage(buf);
+                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                               if (!P_ISDELETED(opaque))
+                                       break;
+                       }
+                       /*
+                        * Now return to top of loop, resetting obknum to
+                        * point to this nondeleted page, and try again.
+                        */
+               }
+               else
+               {
+                       /*
+                        * It wasn't deleted; the explanation had better be
+                        * that the page to the left got split or deleted.
+                        * Without this check, we'd go into an infinite loop
+                        * if there's anything wrong.
+                        */
+                       if (opaque->btpo_prev == lblkno)
+                               elog(ERROR, "_bt_walk_left: can't find left sibling in %s",
+                                        RelationGetRelationName(rel));
+                       /* Okay to try again with new lblkno value */
+               }
+       }
+
+       return InvalidBuffer;
+}
+
  /*
   * _bt_get_endpoint() -- Find the first or last page on a given tree level
   *
   * If the index is empty, we will return InvalidBuffer; any other failure
- * condition causes elog().
+ * condition causes elog().  We will not return a dead page.
   *
   * The returned buffer is pinned and read-locked.
   */
@@ -941,12 +1070,13 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
                  * step right if needed to get to it (this could happen if the
                  * page split since we obtained a pointer to it).
                  */
-               while (P_ISDELETED(opaque) ||
+               while (P_IGNORE(opaque) ||
                            (rightmost && !P_RIGHTMOST(opaque)))
                 {
                         blkno = opaque->btpo_next;
                         if (blkno == P_NONE)
-                               elog(ERROR, "_bt_get_endpoint: ran off end of btree");
+                               elog(ERROR, "_bt_get_endpoint: fell off the end of %s",
+                                        RelationGetRelationName(rel));
                         _bt_relbuf(rel, buf);
                         buf = _bt_getbuf(rel, blkno, BT_READ);
                         page = BufferGetPage(buf);
@@ -959,7 +1089,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
                 if (opaque->btpo.level < level)
                         elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);
  
-               /* Step to leftmost or rightmost child page */
+               /* Descend to leftmost or rightmost child page */
                 if (rightmost)
                         offnum = PageGetMaxOffsetNumber(page);
                 else
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c

index f9d227ecd0fb28247e0088a5fb58c7f14d728921..62f020086d8186ff2815239c0a4f38226fa44253 100644 (file)
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1,4 +1,5 @@
  /*-------------------------------------------------------------------------
+ *
   * nbtsort.c
   *             Build a btree from sorted input by loading leaf pages sequentially.
   *
@@ -35,7 +36,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.72 2003/02/22 00:45:04 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -164,8 +165,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
                 ResetUsage();
         }
  #endif   /* BTREE_BUILD_STATS */
-       tuplesort_performsort(btspool->sortstate);
  
+       tuplesort_performsort(btspool->sortstate);
         if (btspool2)
                 tuplesort_performsort(btspool2->sortstate);
         _bt_load(btspool->index, btspool, btspool2);
@@ -331,7 +332,7 @@ _bt_sortaddtup(Page page,
  
         if (PageAddItem(page, (Item) btitem, itemsize, itup_off,
                                         LP_USED) == InvalidOffsetNumber)
-               elog(FATAL, "btree: failed to add item to the page in _bt_sort");
+               elog(ERROR, "btree: failed to add item to the page in _bt_sort");
  }
  
  /*----------
@@ -470,8 +471,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
  
                 /*
                  * Write out the old page.      We never want to see it again, so we
-                * can give up our lock (if we had one; most likely BuildingBtree
-                * is set, so we aren't locking).
+                * can give up our lock.
                  */
                 _bt_blwritepage(index, obuf);
  
@@ -534,7 +534,7 @@ _bt_uppershutdown(Relation index, BTPageState *state)
                 if (s->btps_next == (BTPageState *) NULL)
                 {
                         opaque->btpo_flags |= BTP_ROOT;
-                       _bt_metaproot(index, blkno, s->btps_level + 1);
+                       _bt_metaproot(index, blkno, s->btps_level);
                 }
                 else
                 {
diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c

index 650820085cadd58ad18fc3b8b715f05b098257da..b6b2a19e10b32e190e66f1d282df13a70d44c20f 100644 (file)
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.75 2002/09/04 20:31:13 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.76 2003/02/22 00:45:04 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1250,8 +1250,9 @@ rtbulkdelete(PG_FUNCTION_ARGS)
  
         result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
         result->num_pages = num_pages;
-       result->tuples_removed = tuples_removed;
         result->num_index_tuples = num_index_tuples;
+       result->tuples_removed = tuples_removed;
+       result->pages_free = 0;
  
         PG_RETURN_POINTER(result);
  }
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c

index c1b17bba86b2a1ad12b43a4f7a4bdc1ee786493d..ac45a5df69d457c02cb79a2b384f6af9559f8500 100644 (file)
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.247 2003/02/09 06:56:27 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.248 2003/02/22 00:45:05 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -2603,17 +2603,25 @@ static void
  scan_index(Relation indrel, double num_tuples)
  {
         IndexBulkDeleteResult *stats;
+       IndexVacuumCleanupInfo vcinfo;
         VacRUsage       ru0;
  
         vac_init_rusage(&ru0);
  
         /*
-        * Even though we're not planning to delete anything, use the
-        * ambulkdelete call, so that the scan happens within the index AM for
-        * more speed.
+        * Even though we're not planning to delete anything, we use the
+        * ambulkdelete call, because (a) the scan happens within the index AM
+        * for more speed, and (b) it may want to pass private statistics to
+        * the amvacuumcleanup call.
          */
         stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
  
+       /* Do post-VACUUM cleanup, even though we deleted nothing */
+       vcinfo.vacuum_full = true;
+       vcinfo.message_level = elevel;
+
+       stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
         if (!stats)
                 return;
  
@@ -2622,9 +2630,9 @@ scan_index(Relation indrel, double num_tuples)
                                                 stats->num_pages, stats->num_index_tuples,
                                                 false);
  
-       elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
+       elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s",
                  RelationGetRelationName(indrel),
-                stats->num_pages, stats->num_index_tuples,
+                stats->num_pages, stats->pages_free, stats->num_index_tuples,
                  vac_show_rusage(&ru0));
  
         /*
@@ -2661,6 +2669,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
                          double num_tuples, int keep_tuples)
  {
         IndexBulkDeleteResult *stats;
+       IndexVacuumCleanupInfo vcinfo;
         VacRUsage       ru0;
  
         vac_init_rusage(&ru0);
@@ -2668,6 +2677,12 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
         /* Do bulk deletion */
         stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);
  
+       /* Do post-VACUUM cleanup */
+       vcinfo.vacuum_full = true;
+       vcinfo.message_level = elevel;
+
+       stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
         if (!stats)
                 return;
  
@@ -2676,8 +2691,9 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
                                                 stats->num_pages, stats->num_index_tuples,
                                                 false);
  
-       elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
-                RelationGetRelationName(indrel), stats->num_pages,
+       elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
+                RelationGetRelationName(indrel),
+                stats->num_pages, stats->pages_free,
                  stats->num_index_tuples - keep_tuples, stats->tuples_removed,
                  vac_show_rusage(&ru0));
  
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c

index 2974eb1bc3fa577ae89f57c3525a707f0e54ff11..9790ef30bc43a7b81e7f1d6e3599a494129ed8ab 100644 (file)
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -31,7 +31,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.23 2002/11/13 00:39:46 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.24 2003/02/22 00:45:05 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -200,7 +200,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                                 tups_vacuumed,
                                 nkeep,
                                 nunused;
-       bool            did_vacuum_index = false;
         int                     i;
         VacRUsage       ru0;
  
@@ -244,7 +243,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                         /* Remove index entries */
                         for (i = 0; i < nindexes; i++)
                                 lazy_vacuum_index(Irel[i], vacrelstats);
-                       did_vacuum_index = true;
                         /* Remove tuples from heap */
                         lazy_vacuum_heap(onerel, vacrelstats);
                         /* Forget the now-vacuumed tuples, and press on */
@@ -415,7 +413,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
         vacrelstats->rel_tuples = num_tuples;
  
         /* If any tuples need to be deleted, perform final vacuum cycle */
-       /* XXX put a threshold on min nuber of tuples here? */
+       /* XXX put a threshold on min number of tuples here? */
         if (vacrelstats->num_dead_tuples > 0)
         {
                 /* Remove index entries */
@@ -424,9 +422,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                 /* Remove tuples from heap */
                 lazy_vacuum_heap(onerel, vacrelstats);
         }
-       else if (!did_vacuum_index)
+       else
         {
-               /* Scan indexes just to update pg_class statistics about them */
+               /* Must do post-vacuum cleanup and statistics update anyway */
                 for (i = 0; i < nindexes; i++)
                         lazy_scan_index(Irel[i], vacrelstats);
         }
@@ -551,42 +549,36 @@ static void
  lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
  {
         IndexBulkDeleteResult *stats;
+       IndexVacuumCleanupInfo vcinfo;
         VacRUsage       ru0;
  
         vac_init_rusage(&ru0);
  
         /*
-        * If the index is not partial, skip the scan, and just assume it has
-        * the same number of tuples as the heap.
-        */
-       if (!vac_is_partial_index(indrel))
-       {
-               vac_update_relstats(RelationGetRelid(indrel),
-                                                       RelationGetNumberOfBlocks(indrel),
-                                                       vacrelstats->rel_tuples,
-                                                       false);
-               return;
-       }
-
-       /*
-        * If index is unsafe for concurrent access, must lock it; but a
-        * shared lock should be sufficient.
+        * If index is unsafe for concurrent access, must lock it.
          */
         if (!indrel->rd_am->amconcurrent)
-               LockRelation(indrel, AccessShareLock);
+               LockRelation(indrel, AccessExclusiveLock);
  
         /*
-        * Even though we're not planning to delete anything, use the
-        * ambulkdelete call, so that the scan happens within the index AM for
-        * more speed.
+        * Even though we're not planning to delete anything, we use the
+        * ambulkdelete call, because (a) the scan happens within the index AM
+        * for more speed, and (b) it may want to pass private statistics to
+        * the amvacuumcleanup call.
          */
         stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
  
+       /* Do post-VACUUM cleanup, even though we deleted nothing */
+       vcinfo.vacuum_full = false;
+       vcinfo.message_level = elevel;
+
+       stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
         /*
          * Release lock acquired above.
          */
         if (!indrel->rd_am->amconcurrent)
-               UnlockRelation(indrel, AccessShareLock);
+               UnlockRelation(indrel, AccessExclusiveLock);
  
         if (!stats)
                 return;
@@ -596,9 +588,9 @@ lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
                                                 stats->num_pages, stats->num_index_tuples,
                                                 false);
  
-       elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
+       elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s",
                  RelationGetRelationName(indrel),
-                stats->num_pages, stats->num_index_tuples,
+                stats->num_pages, stats->pages_free, stats->num_index_tuples,
                  vac_show_rusage(&ru0));
  
         pfree(stats);
@@ -617,6 +609,7 @@ static void
  lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats)
  {
         IndexBulkDeleteResult *stats;
+       IndexVacuumCleanupInfo vcinfo;
         VacRUsage       ru0;
  
         vac_init_rusage(&ru0);
@@ -630,26 +623,33 @@ lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats)
         /* Do bulk deletion */
         stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats);
  
+       /* Do post-VACUUM cleanup */
+       vcinfo.vacuum_full = false;
+       vcinfo.message_level = elevel;
+
+       stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
         /*
          * Release lock acquired above.
          */
         if (!indrel->rd_am->amconcurrent)
                 UnlockRelation(indrel, AccessExclusiveLock);
  
+       if (!stats)
+               return;
+
         /* now update statistics in pg_class */
-       if (stats)
-       {
-               vac_update_relstats(RelationGetRelid(indrel),
-                                                       stats->num_pages, stats->num_index_tuples,
-                                                       false);
+       vac_update_relstats(RelationGetRelid(indrel),
+                                               stats->num_pages, stats->num_index_tuples,
+                                               false);
  
-               elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
-                        RelationGetRelationName(indrel), stats->num_pages,
-                        stats->num_index_tuples, stats->tuples_removed,
-                        vac_show_rusage(&ru0));
+       elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
+                RelationGetRelationName(indrel),
+                stats->num_pages, stats->pages_free,
+                stats->num_index_tuples, stats->tuples_removed,
+                vac_show_rusage(&ru0));
  
-               pfree(stats);
-       }
+       pfree(stats);
  }
  
  /*
diff --git a/src/include/access/genam.h b/src/include/access/genam.h

index 6266da47c8ff1a2e86d2593a608148b1fbdb5996..59ecf1d8f4f6fe1e917b71c43c642f1881d1ec3a 100644 (file)
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: genam.h,v 1.37 2002/09/04 20:31:36 momjian Exp $
+ * $Id: genam.h,v 1.38 2003/02/22 00:45:05 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -20,17 +20,32 @@
  #include "nodes/primnodes.h"
  
  
-/* Struct for statistics returned by bulk-delete operation */
+/*
+ * Struct for statistics returned by bulk-delete operation
+ *
+ * This is now also passed to the index AM's vacuum-cleanup operation,
+ * if it has one, which can modify the results as needed.  Note that
+ * an index AM could choose to have bulk-delete return a larger struct
+ * of which this is just the first field; this provides a way for bulk-delete
+ * to communicate additional private data to vacuum-cleanup.
+ */
  typedef struct IndexBulkDeleteResult
  {
         BlockNumber num_pages;          /* pages remaining in index */
+       double          num_index_tuples;               /* tuples remaining */
         double          tuples_removed; /* # removed by bulk-delete operation */
-       double          num_index_tuples;               /* # remaining */
+       BlockNumber     pages_free;             /* # unused pages in index */
  } IndexBulkDeleteResult;
  
  /* Typedef for callback function to determine if a tuple is bulk-deletable */
  typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
  
+/* Struct for additional arguments passed to vacuum-cleanup operation */
+typedef struct IndexVacuumCleanupInfo
+{
+       bool            vacuum_full;    /* VACUUM FULL (we have exclusive lock) */
+       int                     message_level;  /* elog level for progress messages */
+} IndexVacuumCleanupInfo;
  
  /* Struct for heap-or-index scans of system tables */
  typedef struct SysScanDescData
@@ -72,6 +87,9 @@ extern bool index_getnext_indexitem(IndexScanDesc scan,
  extern IndexBulkDeleteResult *index_bulk_delete(Relation indexRelation,
                                   IndexBulkDeleteCallback callback,
                                   void *callback_state);
+extern IndexBulkDeleteResult *index_vacuum_cleanup(Relation indexRelation,
+                                 IndexVacuumCleanupInfo *info,
+                                 IndexBulkDeleteResult *stats);
  extern RegProcedure index_cost_estimator(Relation indexRelation);
  extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
                                 uint16 procnum);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h

index f4dce1842f18b52d80e607acf6c1d8cb4f1491f7..4bb5db0513e872e7fbbd1071b18d5a33ee8e0d7d 100644 (file)
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
+ * $Id: nbtree.h,v 1.65 2003/02/22 00:45:05 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -54,6 +54,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
  #define BTP_ROOT               (1 << 1)        /* root page (has no parent) */
  #define BTP_DELETED            (1 << 2)        /* page has been deleted from tree */
  #define BTP_META               (1 << 3)        /* meta-page */
+#define BTP_HALF_DEAD  (1 << 4)        /* empty, but still in tree */
  
  
  /*
@@ -124,12 +125,13 @@ typedef BTItemData *BTItem;
  #define SizeOfBTItem   sizeof(BTItemData)
  
  /* Test whether items are the "same" per the above notes */
-#define BTItemSame(i1, i2)       ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
-                                                               (i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
-                                                               (i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
-                                                               (i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
-                                                               (i1)->bti_itup.t_tid.ip_posid == \
-                                                               (i2)->bti_itup.t_tid.ip_posid )
+#define BTTidSame(i1, i2)      \
+       ( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
+         (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
+         (i1).ip_posid == (i2).ip_posid )
+#define BTItemSame(i1, i2)     \
+       BTTidSame((i1)->bti_itup.t_tid, (i2)->bti_itup.t_tid)
+
  
  /*
   *     In general, the btree code tries to localize its knowledge about
@@ -150,6 +152,7 @@ typedef BTItemData *BTItem;
  #define P_ISLEAF(opaque)               ((opaque)->btpo_flags & BTP_LEAF)
  #define P_ISROOT(opaque)               ((opaque)->btpo_flags & BTP_ROOT)
  #define P_ISDELETED(opaque)            ((opaque)->btpo_flags & BTP_DELETED)
+#define P_IGNORE(opaque)               ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
  
  /*
   *     Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -412,8 +415,6 @@ typedef BTScanOpaqueData *BTScanOpaque;
  /*
   * prototypes for functions in nbtree.c (external entry points for btree)
   */
-extern bool BuildingBtree;             /* in nbtree.c */
-
  extern void AtEOXact_nbtree(void);
  
  extern Datum btbuild(PG_FUNCTION_ARGS);
@@ -426,6 +427,7 @@ extern Datum btendscan(PG_FUNCTION_ARGS);
  extern Datum btmarkpos(PG_FUNCTION_ARGS);
  extern Datum btrestrpos(PG_FUNCTION_ARGS);
  extern Datum btbulkdelete(PG_FUNCTION_ARGS);
+extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
  
  /*
   * prototypes for functions in nbtinsert.c
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h

index a1be9bacf3bc29706af66ed3b095a1e08a766803..cb2e6e523df7106dbf43a256b7d737aff6591dc0 100644 (file)
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $
+ * $Id: xlog.h,v 1.42 2003/02/22 00:45:05 tgl Exp $
   */
  #ifndef XLOG_H
  #define XLOG_H
@@ -56,17 +56,18 @@ typedef struct XLogRecord
  #define XLR_INFO_MASK                  0x0F
  
  /*
- * We support backup of up to 2 disk blocks per XLOG record (could support
- * more if we cared to dedicate more xl_info bits for this purpose; currently
- * do not need more than 2 anyway).  If we backed up any disk blocks then we
- * use flag bits in xl_info to signal it.
+ * If we backed up any disk blocks with the XLOG record, we use flag bits in
+ * xl_info to signal it.  We support backup of up to 3 disk blocks per XLOG
+ * record.  (Could support 4 if we cared to dedicate all the xl_info bits for
+ * this purpose; currently bit 0 of xl_info is unused and available.)
   */
-#define XLR_BKP_BLOCK_MASK             0x0C    /* all info bits used for bkp
+#define XLR_BKP_BLOCK_MASK             0x0E    /* all info bits used for bkp
                                                                                  * blocks */
-#define XLR_MAX_BKP_BLOCKS             2
+#define XLR_MAX_BKP_BLOCKS             3
  #define XLR_SET_BKP_BLOCK(iblk) (0x08 >> (iblk))
  #define XLR_BKP_BLOCK_1                        XLR_SET_BKP_BLOCK(0)    /* 0x08 */
  #define XLR_BKP_BLOCK_2                        XLR_SET_BKP_BLOCK(1)    /* 0x04 */
+#define XLR_BKP_BLOCK_3                        XLR_SET_BKP_BLOCK(2)    /* 0x02 */
  
  /*
   * Sometimes we log records which are out of transaction control.
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index 240889577ae997089bb2deee1ca9aee792439a60..fc24db5d2e1a455bb9c711067aa1fe942c4561ef 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $
+ * $Id: catversion.h,v 1.179 2003/02/22 00:45:05 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -53,6 +53,6 @@
   */
  
  /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200302171
+#define CATALOG_VERSION_NO     200302211
  
  #endif
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h

index 66b2f2621f1c83ae71e87e28f18404404e8b1ef8..3ee7121812cf0adb4c58930b76865735a6e79ef5 100644 (file)
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -8,7 +8,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: pg_am.h,v 1.23 2002/07/29 22:14:11 tgl Exp $
+ * $Id: pg_am.h,v 1.24 2003/02/22 00:45:05 tgl Exp $
   *
   * NOTES
   *             the genbki.sh script reads this file and generates .bki
@@ -58,6 +58,7 @@ CATALOG(pg_am)
         regproc         amrestrpos;             /* "restore marked scan position" function */
         regproc         ambuild;                /* "build new index" function */
         regproc         ambulkdelete;   /* bulk-delete function */
+       regproc         amvacuumcleanup; /* post-VACUUM cleanup function */
         regproc         amcostestimate; /* estimate cost of an indexscan */
  } FormData_pg_am;
  
@@ -72,7 +73,7 @@ typedef FormData_pg_am *Form_pg_am;
   *             compiler constants for pg_am
   * ----------------
   */
-#define Natts_pg_am                                            19
+#define Natts_pg_am                                            20
  #define Anum_pg_am_amname                              1
  #define Anum_pg_am_amowner                             2
  #define Anum_pg_am_amstrategies                        3
@@ -91,21 +92,22 @@ typedef FormData_pg_am *Form_pg_am;
  #define Anum_pg_am_amrestrpos                  16
  #define Anum_pg_am_ambuild                             17
  #define Anum_pg_am_ambulkdelete                        18
-#define Anum_pg_am_amcostestimate              19
+#define Anum_pg_am_amvacuumcleanup             19
+#define Anum_pg_am_amcostestimate              20
  
  /* ----------------
   *             initial contents of pg_am
   * ----------------
   */
  
-DATA(insert OID = 402 (  rtree PGUID   8 3 0 f f f f rtgettuple rtinsert rtbeginscan rtrescan rtendscan rtmarkpos rtrestrpos rtbuild rtbulkdelete rtcostestimate ));
+DATA(insert OID = 402 (  rtree PGUID   8 3 0 f f f f rtgettuple rtinsert rtbeginscan rtrescan rtendscan rtmarkpos rtrestrpos rtbuild rtbulkdelete - rtcostestimate ));
  DESCR("r-tree index access method");
-DATA(insert OID = 403 (  btree PGUID   5 1 1 t t t t btgettuple btinsert btbeginscan btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btcostestimate ));
+DATA(insert OID = 403 (  btree PGUID   5 1 1 t t t t btgettuple btinsert btbeginscan btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate ));
  DESCR("b-tree index access method");
  #define BTREE_AM_OID 403
-DATA(insert OID = 405 (  hash  PGUID   1 1 0 f f f t hashgettuple hashinsert hashbeginscan hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashcostestimate ));
+DATA(insert OID = 405 (  hash  PGUID   1 1 0 f f f t hashgettuple hashinsert hashbeginscan hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete - hashcostestimate ));
  DESCR("hash index access method");
-DATA(insert OID = 783 (  gist  PGUID 100 7 0 f t f f gistgettuple gistinsert gistbeginscan gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistcostestimate ));
+DATA(insert OID = 783 (  gist  PGUID 100 7 0 f t f f gistgettuple gistinsert gistbeginscan gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete - gistcostestimate ));
  DESCR("GiST index access method");
  #define GIST_AM_OID 783
  
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h

index f32715284b0e79887359621ae38cad154228cd0f..3aab3ef8a736e8a350f9492e2e18eaf1ef122919 100644 (file)
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: pg_proc.h,v 1.283 2003/02/13 05:24:02 momjian Exp $
+ * $Id: pg_proc.h,v 1.284 2003/02/22 00:45:05 tgl Exp $
   *
   * NOTES
   *       The script catalog/genbki.sh reads this file and generates .bki
@@ -710,6 +710,8 @@ DATA(insert OID = 338 (  btbuild               PGNSP PGUID 12 f f t f v 3 2278 "2281 2281
  DESCR("btree(internal)");
  DATA(insert OID = 332 (  btbulkdelete     PGNSP PGUID 12 f f t f v 3 2281 "2281 2281 2281" btbulkdelete - _null_ ));
  DESCR("btree(internal)");
+DATA(insert OID = 972 (  btvacuumcleanup   PGNSP PGUID 12 f f t f v 3 2281 "2281 2281 2281" btvacuumcleanup - _null_ ));
+DESCR("btree(internal)");
  DATA(insert OID = 1268 (  btcostestimate   PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281"  btcostestimate - _null_ ));
  DESCR("btree(internal)");
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 22 Feb 2003 00:45:05 +0000 (00:45 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 22 Feb 2003 00:45:05 +0000 (00:45 +0000)
doc/src/sgml/catalogs.sgml		patch \| blob \| history
src/backend/access/gist/gist.c		patch \| blob \| history
src/backend/access/hash/hash.c		patch \| blob \| history
src/backend/access/index/indexam.c		patch \| blob \| history
src/backend/access/nbtree/nbtinsert.c		patch \| blob \| history
src/backend/access/nbtree/nbtpage.c		patch \| blob \| history
src/backend/access/nbtree/nbtree.c		patch \| blob \| history
src/backend/access/nbtree/nbtsearch.c		patch \| blob \| history
src/backend/access/nbtree/nbtsort.c		patch \| blob \| history
src/backend/access/rtree/rtree.c		patch \| blob \| history
src/backend/commands/vacuum.c		patch \| blob \| history
src/backend/commands/vacuumlazy.c		patch \| blob \| history
src/include/access/genam.h		patch \| blob \| history
src/include/access/nbtree.h		patch \| blob \| history
src/include/access/xlog.h		patch \| blob \| history
src/include/catalog/catversion.h		patch \| blob \| history
src/include/catalog/pg_am.h		patch \| blob \| history
src/include/catalog/pg_proc.h		patch \| blob \| history