Fix bogus concurrent use of _hash_getnewbuf() in bucket split code.

author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 30 Mar 2015 20:40:05 +0000 (16:40 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 30 Mar 2015 20:40:05 +0000 (16:40 -0400)
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c

index 46c6c9666e5e846daac7074baa0b0501c8ae3e22..9a779454a0cd8e2b78a8b24271bd77c054f7ea1e 100644 (file)
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -37,6 +37,7 @@
  static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
                                         uint32 nblocks);
  static void _hash_splitbucket(Relation rel, Buffer metabuf,
+                                 Buffer nbuf,
                                   Bucket obucket, Bucket nbucket,
                                   BlockNumber start_oblkno,
                                   BlockNumber start_nblkno,
@@ -176,7 +177,9 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
   *             EOF but before updating the metapage to reflect the added page.)
   *
   *             It is caller's responsibility to ensure that only one process can
- *             extend the index at a time.
+ *             extend the index at a time.  In practice, this function is called
+ *             only while holding write lock on the metapage, because adding a page
+ *             is always associated with an update of metapage data.
   */
  Buffer
  _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
@@ -503,6 +506,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         uint32          spare_ndx;
         BlockNumber start_oblkno;
         BlockNumber start_nblkno;
+       Buffer          buf_nblkno;
         uint32          maxbucket;
         uint32          highmask;
         uint32          lowmask;
@@ -603,6 +607,13 @@ _hash_expandtable(Relation rel, Buffer metabuf)
                 }
         }
  
+       /*
+        * Physically allocate the new bucket's primary page.  We want to do this
+        * before changing the metapage's mapping info, in case we can't get the
+        * disk space.
+        */
+       buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
+
         /*
          * Okay to proceed with split.  Update the metapage bucket mapping info.
          *
@@ -653,7 +664,8 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
  
         /* Relocate records to the new bucket */
-       _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
+       _hash_splitbucket(rel, metabuf, buf_nblkno,
+                                         old_bucket, new_bucket,
                                           start_oblkno, start_nblkno,
                                           maxbucket, highmask, lowmask);
  
@@ -733,10 +745,16 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
   * The caller must hold a pin, but no lock, on the metapage buffer.
   * The buffer is returned in the same state.  (The metapage is only
   * touched if it becomes necessary to add or remove overflow pages.)
+ *
+ * In addition, the caller must have created the new bucket's base page,
+ * which is passed in buffer nbuf, pinned and write-locked.  The lock
+ * and pin are released here.  (The API is set up this way because we must
+ * do _hash_getnewbuf() before releasing the metapage write lock.)
   */
  static void
  _hash_splitbucket(Relation rel,
                                   Buffer metabuf,
+                                 Buffer nbuf,
                                   Bucket obucket,
                                   Bucket nbucket,
                                   BlockNumber start_oblkno,
@@ -748,7 +766,6 @@ _hash_splitbucket(Relation rel,
         BlockNumber oblkno;
         BlockNumber nblkno;
         Buffer          obuf;
-       Buffer          nbuf;
         Page            opage;
         Page            npage;
         HashPageOpaque oopaque;
@@ -765,7 +782,7 @@ _hash_splitbucket(Relation rel,
         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
  
         nblkno = start_nblkno;
-       nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
+       Assert(nblkno == BufferGetBlockNumber(nbuf));
         npage = BufferGetPage(nbuf);
  
         /* initialize the new bucket's primary page */
@@ -814,6 +831,11 @@ _hash_splitbucket(Relation rel,
                                  * insert the tuple into the new bucket.  if it doesn't fit on
                                  * the current page in the new bucket, we must allocate a new
                                  * overflow page and place the tuple on that page instead.
+                                *
+                                * XXX we have a problem here if we fail to get space for a
+                                * new overflow page: we'll error out leaving the bucket split
+                                * only partially complete, meaning the index is corrupt,
+                                * since searches may fail to find entries they should find.
                                  */
                                 itemsz = IndexTupleDSize(*itup);
                                 itemsz = MAXALIGN(itemsz);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 30 Mar 2015 20:40:05 +0000 (16:40 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 30 Mar 2015 20:40:05 +0000 (16:40 -0400)