*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.41 2003/09/04 22:06:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.41.2.1 2006/11/19 21:33:46 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
#include "access/hash.h"
-static BlockNumber _hash_getovflpage(Relation rel, Buffer metabuf);
+static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
- BlockNumber ovflblkno;
Buffer ovflbuf;
Page page;
Page ovflpage;
HashPageOpaque pageopaque;
HashPageOpaque ovflopaque;
- /* allocate an empty overflow page */
- ovflblkno = _hash_getovflpage(rel, metabuf);
-
- /* lock the overflow page */
- ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+ /* allocate and lock an empty overflow page */
+ ovflbuf = _hash_getovflpage(rel, metabuf);
ovflpage = BufferGetPage(ovflbuf);
/*
_hash_wrtnorelbuf(rel, ovflbuf);
/* logically chain overflow page to previous page */
- pageopaque->hasho_nextblkno = ovflblkno;
+ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
_hash_wrtbuf(rel, buf);
return ovflbuf;
/*
* _hash_getovflpage()
*
- * Find an available overflow page and return its block number.
+ * Find an available overflow page and return it. The returned buffer
+ * is pinned and write-locked, but its contents are not initialized.
*
* The caller must hold a pin, but no lock, on the metapage buffer.
- * The buffer is returned in the same state.
+ * That buffer is left in the same state at exit.
*/
-static BlockNumber
+static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
HashMetaPage metap;
Buffer mapbuf = 0;
+ Buffer newbuf;
BlockNumber blkno;
uint32 orig_firstfree;
uint32 splitnum;
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
}
- /* No Free Page Found - have to allocate a new page */
- bit = metap->hashm_spares[splitnum];
- metap->hashm_spares[splitnum]++;
-
- /* Check if we need to allocate a new bitmap page */
+ /*
+ * No free pages --- have to extend the relation to add an overflow page.
+ * First, check to see if we have to add a new bitmap page too.
+ */
if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
{
/*
* correctly marked "in use". Subsequent pages do not exist yet,
* but it is convenient to pre-mark them as "in use" too.
*/
- _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
-
bit = metap->hashm_spares[splitnum];
+ _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
metap->hashm_spares[splitnum]++;
}
else
{
/*
- * Nothing to do here; since the page was past the last used page,
+ * Nothing to do here; since the page will be past the last used page,
* we know its bitmap bit was preinitialized to "in use".
*/
}
/* Calculate address of the new overflow page */
+ bit = metap->hashm_spares[splitnum];
blkno = bitno_to_blkno(metap, bit);
+ /*
+ * We have to fetch the page with P_NEW to ensure smgr's idea of the
+ * relation length stays in sync with ours. XXX It's annoying to do this
+ * with metapage write lock held; would be better to use a lock that
+ * doesn't block incoming searches. Best way to fix it would be to stop
+ * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
+ * smgr relation length to track where new overflow pages come from;
+ * then we could release the metapage before we do the smgrextend.
+ * FIXME later (not in beta...)
+ */
+ newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ if (BufferGetBlockNumber(newbuf) != blkno)
+ elog(ERROR, "unexpected hash relation size: %u, should be %u",
+ BufferGetBlockNumber(newbuf), blkno);
+
+ metap->hashm_spares[splitnum]++;
+
/*
* Adjust hashm_firstfree to avoid redundant searches. But don't
* risk changing it if someone moved it while we were searching
/* Write updated metapage and release lock, but not pin */
_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
- return blkno;
+ return newbuf;
found:
/* convert bit to bit number within page */
/* convert bit to absolute bit number */
bit += (i << BMPG_SHIFT(metap));
- /* Calculate address of the new overflow page */
+ /* Calculate address of the recycled overflow page */
blkno = bitno_to_blkno(metap, bit);
/*
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
}
- return blkno;
+ /* Fetch and return the recycled page */
+ return _hash_getbuf(rel, blkno, HASH_WRITE);
}
/*
prevblkno = ovflopaque->hasho_prevblkno;
bucket = ovflopaque->hasho_bucket;
- /* Zero the page for debugging's sake; then write and release it */
+ /*
+ * Zero the page for debugging's sake; then write and release it.
+ * (Note: if we failed to zero the page here, we'd have problems
+ * with the Assert in _hash_pageinit() when the page is reused.)
+ */
MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
_hash_wrtbuf(rel, ovflbuf);
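For context, the Assert mentioned in the comment above is the one in _hash_pageinit() (hashpage.c), which reads roughly as follows (a sketch; the exact assertion wording may differ on this branch):

void
_hash_pageinit(Page page, Size size)
{
    /* a page may only be (re)initialized while it is still all zeroes */
    Assert(PageIsNew(page));
    PageInit(page, size, sizeof(HashPageOpaqueData));
}

Zeroing the freed overflow page here guarantees that PageIsNew() (i.e. pd_upper == 0) holds when the page is handed out and reinitialized later.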
/*
* It is okay to write-lock the new bitmap page while holding metapage
* write lock, because no one else could be contending for the new page.
+ * Also, the metapage lock makes it safe to extend the index using P_NEW,
+ * which we want to do to ensure the smgr's idea of the relation size
+ * stays in step with ours.
*
* There is some loss of concurrency in possibly doing I/O for the new
* page while holding the metapage lock, but this path is taken so
* seldom that it's not worth worrying about.
*/
- buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ if (BufferGetBlockNumber(buf) != blkno)
+ elog(ERROR, "unexpected hash relation size: %u, should be %u",
+ BufferGetBlockNumber(buf), blkno);
+
pg = BufferGetPage(buf);
/* initialize the page */
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42 2003/09/04 22:06:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42.2.1 2006/11/19 21:33:46 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
#include "access/genam.h"
#include "access/hash.h"
#include "storage/lmgr.h"
+#include "storage/smgr.h"
#include "utils/lsyscache.h"
+static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
Bucket obucket, Bucket nbucket,
BlockNumber start_oblkno,
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned").
*
- * XXX P_NEW is not used because, unlike the tree structures, we
- * need the bucket blocks to be at certain block numbers. we must
- * depend on the caller to call _hash_pageinit on the block if it
- * knows that this is a new block.
+ * blkno == P_NEW is allowed, but it is caller's responsibility to
+ * ensure that only one process can extend the index at a time.
*/
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
Buffer buf;
- if (blkno == P_NEW)
- elog(ERROR, "hash AM does not use P_NEW");
-
buf = ReadBuffer(rel, blkno);
if (access != HASH_NOLOCK)
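The P_NEW contract stated above is what the callers patched in this change rely on: the caller holds a lock that serializes extension (the metapage write lock in these paths) and cross-checks the block number it gets back. A minimal sketch of that pattern, using a hypothetical helper name:

/* Illustrative sketch only; the helper name is hypothetical. */
static Buffer
hash_extend_one_page(Relation rel, BlockNumber expected_blkno)
{
    Buffer      newbuf;

    /* the caller must already hold a lock that serializes index extension */
    newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
    if (BufferGetBlockNumber(newbuf) != expected_blkno)
        elog(ERROR, "unexpected hash relation size: %u, should be %u",
             BufferGetBlockNumber(newbuf), expected_blkno);
    return newbuf;
}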
if (ffactor < 10)
ffactor = 10;
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ /*
+ * We initialize the metapage, the first two bucket pages, and the
+ * first bitmap page in sequence, using P_NEW to cause smgrextend()
+ * calls to occur. This ensures that the smgr level has the right
+ * idea of the physical index length.
+ */
+ metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
pg = BufferGetPage(metabuf);
_hash_pageinit(pg, BufferGetPageSize(metabuf));
*/
for (i = 0; i <= 1; i++)
{
- buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
+ buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf);
_hash_pageinit(pg, BufferGetPageSize(buf));
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
}
/*
- * Initialize first bitmap page. Can't do this until we
- * create the first two buckets, else smgr will complain.
+ * Initialize first bitmap page
*/
_hash_initbitmap(rel, metap, 3);
Bucket old_bucket;
Bucket new_bucket;
uint32 spare_ndx;
+ BlockNumber firstblock = InvalidBlockNumber;
BlockNumber start_oblkno;
BlockNumber start_nblkno;
uint32 maxbucket;
(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
goto fail;
+ /*
+ * Can't split anymore if maxbucket has reached its maximum possible value.
+ *
+ * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
+ * the calculation maxbucket+1 mustn't overflow). Currently we restrict
+ * to half that because of overflow looping in _hash_log2() and
+ * insufficient space in hashm_spares[]. It's moot anyway because an
+ * index with 2^32 buckets would certainly overflow BlockNumber and
+ * hence _hash_alloc_buckets() would fail, but if we supported buckets
+ * smaller than a disk block then this would be an independent constraint.
+ */
+ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
+ goto fail;
+
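The "overflow looping in _hash_log2()" mentioned above refers to the function in hashutil.c, which is essentially the following (shown for reference; unchanged by this patch):

uint32
_hash_log2(uint32 num)
{
    uint32      i,
                limit;

    limit = 1;
    for (i = 0; limit < num; limit <<= 1, i++)
        ;
    return i;
}

For num greater than 2^31, limit wraps around to zero and the loop never terminates. Capping hashm_maxbucket at 0x7FFFFFFE keeps the largest argument ever passed, new_bucket + 1, at or below 2^31, which still terminates.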
+ /*
+ * If the split point is increasing (hashm_maxbucket's log base 2
+ * increases), we need to allocate a new batch of bucket pages.
+ */
+ new_bucket = metap->hashm_maxbucket + 1;
+ spare_ndx = _hash_log2(new_bucket + 1);
+ if (spare_ndx > metap->hashm_ovflpoint)
+ {
+ Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+ /*
+ * The number of buckets in the new splitpoint is equal to the
+ * total number already in existence, i.e. new_bucket. Currently
+ * this maps one-to-one to blocks required, but someday we may need
+ * a more complicated calculation here.
+ */
+ firstblock = _hash_alloc_buckets(rel, new_bucket);
+ if (firstblock == InvalidBlockNumber)
+ goto fail; /* can't split due to BlockNumber overflow */
+ }
+
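As a concrete instance of the arithmetic above: with buckets 0..3 in existence, hashm_maxbucket is 3 and hashm_ovflpoint is 2. Then new_bucket = 4 and spare_ndx = _hash_log2(5) = 3, which exceeds hashm_ovflpoint, so a new splitpoint begins and _hash_alloc_buckets(rel, 4) must extend the index by four bucket pages, enough for buckets 4..7.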
/*
* Determine which bucket is to be split, and attempt to lock the old
* bucket. If we can't get the lock, give up.
* then lock. This should be okay because no one else should be trying
* to lock the new bucket yet...
*/
- new_bucket = metap->hashm_maxbucket + 1;
old_bucket = (new_bucket & metap->hashm_lowmask);
start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
* increases), we need to adjust the hashm_spares[] array and
* hashm_ovflpoint so that future overflow pages will be created beyond
* this new batch of bucket pages.
- *
- * XXX should initialize new bucket pages to prevent out-of-order
- * page creation? Don't wanna do it right here though.
*/
- spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
if (spare_ndx > metap->hashm_ovflpoint)
{
- Assert(spare_ndx == metap->hashm_ovflpoint + 1);
metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
metap->hashm_ovflpoint = spare_ndx;
}
/* now we can compute the new bucket's primary block number */
start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+ /* if we added a splitpoint, should match result of _hash_alloc_buckets */
+ if (firstblock != InvalidBlockNumber &&
+ firstblock != start_nblkno)
+ elog(PANIC, "unexpected hash relation size: %u, should be %u",
+ firstblock, start_nblkno);
+
Assert(!_hash_has_active_scan(rel, new_bucket));
if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
}
+/*
+ * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
+ *
+ * This does not need to initialize the new bucket pages; we'll do that as
+ * each one is used by _hash_expandtable(). But we have to extend the logical
+ * EOF to the end of the splitpoint; otherwise the first overflow page
+ * allocated beyond the splitpoint will represent a noncontiguous access,
+ * which can confuse md.c (and will probably be forbidden by future changes
+ * to md.c).
+ *
+ * We do this by writing a page of zeroes at the end of the splitpoint range.
+ * We expect that the filesystem will ensure that the intervening pages read
+ * as zeroes too. On many filesystems this "hole" will not be allocated
+ * immediately, which means that the index file may end up more fragmented
+ * than if we forced it all to be allocated now; but since we don't scan
+ * hash indexes sequentially anyway, that probably doesn't matter.
+ *
+ * XXX It's annoying that this code is executed with the metapage lock held.
+ * We need to interlock against _hash_getovflpage() adding a new overflow page
+ * concurrently, but it'd likely be better to use LockRelationForExtension
+ * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
+ * so it may not be worth worrying about.
+ *
+ * Returns the first block number in the new splitpoint's range, or
+ * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ */
+static BlockNumber
+_hash_alloc_buckets(Relation rel, uint32 nblocks)
+{
+ BlockNumber firstblock;
+ BlockNumber lastblock;
+ BlockNumber endblock;
+ char zerobuf[BLCKSZ];
+
+ /*
+ * Since we hold metapage lock, no one else is either splitting or
+ * allocating a new page in _hash_getovflpage(); hence it's safe to
+ * assume that the relation length isn't changing under us.
+ */
+ firstblock = RelationGetNumberOfBlocks(rel);
+ lastblock = firstblock + nblocks - 1;
+
+ /*
+ * Check for overflow in block number calculation; if so, we cannot
+ * extend the index anymore.
+ */
+ if (lastblock < firstblock || lastblock == InvalidBlockNumber)
+ return InvalidBlockNumber;
+
+ /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+
+ MemSet(zerobuf, 0, sizeof(zerobuf));
+
+ /*
+ * XXX If the extension results in creation of new segment files,
+ * we have to make sure that each non-last file is correctly filled out to
+ * RELSEG_SIZE blocks. This ought to be done inside mdextend, but
+ * changing the smgr API seems best left for a development cycle, not
+ * late beta. Temporary fix for bug #2737.
+ */
+#ifndef LET_OS_MANAGE_FILESIZE
+ for (endblock = firstblock | (RELSEG_SIZE - 1);
+ endblock < lastblock;
+ endblock += RELSEG_SIZE)
+ smgrextend(DEFAULT_SMGR, rel, endblock, zerobuf);
+#endif
+
+ smgrextend(DEFAULT_SMGR, rel, lastblock, zerobuf);
+
+ rel->rd_nblocks = lastblock + 1;
+
+ return firstblock;
+}
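To make the segment arithmetic concrete: with the default RELSEG_SIZE of 131072 blocks (a power of two, which the masking expression relies on), firstblock = 131000 and nblocks = 200 give lastblock = 131199 and firstblock | (RELSEG_SIZE - 1) = 131071, the last block of the current segment file. The loop therefore extends the relation through block 131071, filling that segment out to exactly RELSEG_SIZE blocks, and the final smgrextend() of block 131199 then lands in a newly created second segment.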
+
+
/*
* _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
*