+ START_CRIT_SECTION();
+
+ oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
+ nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
+
+ /*
+ * After the split is finished, mark the old bucket to indicate that it
+ * contains deletable tuples. We will clear the split-cleanup flag after
+ * deleting such tuples, either at the end of the split, at the next split
+ * of the old bucket, or during vacuum.
+ */
+ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
+
+ /*
+ * Mark the buffers dirty. We don't release the locks here; the caller is
+ * responsible for releasing them.
+ */
+ MarkBufferDirty(bucket_obuf);
+ MarkBufferDirty(bucket_nbuf);
+
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_hash_split_complete xlrec;
+
+ xlrec.old_bucket_flag = oopaque->hasho_flag;
+ xlrec.new_bucket_flag = nopaque->hasho_flag;
+
+ XLogBeginInsert();
+
+ XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
+
+ XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
+
+ PageSetLSN(BufferGetPage(bucket_obuf), recptr);
+ PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /*
+ * If possible, clean up the old bucket. We might not be able to do this
+ * if someone else has a pin on it, but if not then we can go ahead. This
+ * isn't absolutely necessary, but it reduces bloat; if we don't do it
+ * now, VACUUM will do it eventually, but maybe not until new overflow
+ * pages have been allocated. Note that there's no need to clean up the
+ * new bucket.
+ */
+ if (IsBufferCleanupOK(bucket_obuf))
+ {
+ LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
+ hashbucketcleanup(rel, obucket, bucket_obuf,
+ BufferGetBlockNumber(bucket_obuf), NULL,
+ maxbucket, highmask, lowmask, NULL, NULL, true,
+ NULL, NULL);
+ }
+ else
+ {
+ LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
+ }
+}
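As a side note on the flags set above: a later split or vacuum is expected to react to LH_BUCKET_NEEDS_SPLIT_CLEANUP before reusing the old bucket. The sketch below is illustrative only and not part of the patch; the helper name and its parameter list are assumptions, though the hashbucketcleanup() call mirrors the one in the hunk above.

    /* Hypothetical helper, not in the patch: clean up a previously split bucket. */
    static void
    maybe_cleanup_old_bucket(Relation rel, Bucket obucket, Buffer bucket_obuf,
                             uint32 maxbucket, uint32 highmask, uint32 lowmask)
    {
        Page        opage = BufferGetPage(bucket_obuf);
        HashPageOpaque oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

        /* old bucket still holds tuples left behind by a finished split? */
        if (oopaque->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP)
            hashbucketcleanup(rel, obucket, bucket_obuf,
                              BufferGetBlockNumber(bucket_obuf), NULL,
                              maxbucket, highmask, lowmask, NULL, NULL, true,
                              NULL, NULL);
    }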
+
+/*
+ * _hash_finish_split() -- Finish the previously interrupted split operation
+ *
+ * To complete the split operation, we build a hash table of the TIDs already
+ * present in the new bucket; the split operation then uses it to skip tuples
+ * that were moved before the split was interrupted.
+ *
+ * The caller must hold a pin, but no lock, on the metapage and old bucket's
+ * primary page buffer. The buffers are returned in the same state. (The
+ * metapage is only touched if it becomes necessary to add or remove overflow
+ * pages.)
+ */
+void
+_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask)
+{
+ HASHCTL hash_ctl;
+ HTAB *tidhtab;
+ Buffer bucket_nbuf = InvalidBuffer;
+ Buffer nbuf;
+ Page npage;
+ BlockNumber nblkno;
+ BlockNumber bucket_nblkno;
+ HashPageOpaque npageopaque;
+ Bucket nbucket;
+ bool found;
+
+ /* Initialize the hash table used to track TIDs */
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(ItemPointerData);
+ hash_ctl.entrysize = sizeof(ItemPointerData);
+ hash_ctl.hcxt = CurrentMemoryContext;
+
+ tidhtab =
+ hash_create("bucket ctids",
+ 256, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
+
+ /*
+ * Scan the new bucket and build a hash table of its TIDs
+ */
+ for (;;)
+ {
+ OffsetNumber noffnum;
+ OffsetNumber nmaxoffnum;
+
+ nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
+ LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+
+ /* remember the primary bucket buffer so we can get a cleanup lock on it */
+ if (nblkno == bucket_nblkno)
+ bucket_nbuf = nbuf;
+
+ npage = BufferGetPage(nbuf);
+ npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+
+ /* Scan each tuple in new page */
+ nmaxoffnum = PageGetMaxOffsetNumber(npage);
+ for (noffnum = FirstOffsetNumber;
+ noffnum <= nmaxoffnum;
+ noffnum = OffsetNumberNext(noffnum))
+ {
+ IndexTuple itup;
+
+ /* Fetch the item's TID and insert it in hash table. */
+ itup = (IndexTuple) PageGetItem(npage,
+ PageGetItemId(npage, noffnum));
+
+ (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
+
+ Assert(!found);
+ }
+
+ nblkno = npageopaque->hasho_nextblkno;
+
+ /*
+ * Release our read lock without modifying the buffer, making sure to
+ * retain the pin on the primary bucket page.
+ */
+ if (nbuf == bucket_nbuf)
+ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, nbuf);
+
+ /* Exit loop if no more overflow pages in new bucket */
+ if (!BlockNumberIsValid(nblkno))
+ break;
+ }
+
+ /*
+ * Conditionally get the cleanup lock on the old and new buckets to perform
+ * the split operation. If we can't get the cleanup locks, silently give
+ * up; the next insertion into the old bucket will try again to complete
+ * the split.
+ */
+ if (!ConditionalLockBufferForCleanup(obuf))
+ {
+ hash_destroy(tidhtab);
+ return;
+ }
+ if (!ConditionalLockBufferForCleanup(bucket_nbuf))
+ {
+ LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+ hash_destroy(tidhtab);
+ return;
+ }
+
+ npage = BufferGetPage(bucket_nbuf);
+ npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+ nbucket = npageopaque->hasho_bucket;
+
+ _hash_splitbucket(rel, metabuf, obucket,
+ nbucket, obuf, bucket_nbuf, tidhtab,
+ maxbucket, highmask, lowmask);
+
+ _hash_dropbuf(rel, bucket_nbuf);
+ hash_destroy(tidhtab);
+}
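For context, a rough sketch of how the insertion path might invoke _hash_finish_split() when it finds the old bucket still flagged as being split. This is not taken from the patch; the local variables (buf, pageopaque, metabuf, maxbucket, highmask, lowmask) are assumed to be in scope at the call site, and the lock handling follows the contract documented in the header comment above.

    /* hypothetical call site, e.g. somewhere in the insert path */
    if (pageopaque->hasho_flag & LH_BUCKET_BEING_SPLIT)
    {
        /* keep the pin, but drop our lock, as _hash_finish_split() expects */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
                           maxbucket, highmask, lowmask);
    }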
+
+/*
+ * log_split_page() -- Log the split operation
+ *
+ * We log the split operation when a page in the new bucket gets full, so we
+ * log the entire page.
+ *
+ * 'buf' must be locked by the caller, which is also responsible for
+ * unlocking it.
+ */
+static void
+log_split_page(Relation rel, Buffer buf)
+{
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
+
+ PageSetLSN(BufferGetPage(buf), recptr);
+ }
+}
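A minimal sketch of the call pattern the comment above implies: when the page currently being filled in the new bucket runs out of room during the split, the split loop would dirty the page and log it inside a critical section before moving on. Illustrative only; nbuf is assumed to be the buffer for that page.

    START_CRIT_SECTION();

    /* ...add the batched tuples to the new bucket page held in nbuf... */
    MarkBufferDirty(nbuf);
    log_split_page(rel, nbuf);  /* logs the entire page, as described above */

    END_CRIT_SECTION();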
+
+/*
+ * _hash_getcachedmetap() -- Returns cached metapage data.
+ *
+ * If *metabuf is not InvalidBuffer, the caller must hold a pin, but no lock,
+ * on the metapage. If it is InvalidBuffer, we'll set it before returning
+ * if we have to read the metapage to refresh the cache, and return with a
+ * pin but no lock on it; the caller is responsible for releasing the pin.
+ *
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage
+_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+ Page page;
+
+ Assert(metabuf);
+ if (force_refresh || rel->rd_amcache == NULL)
+ {
+ char *cache = NULL;
+
+ /*
+ * It's important that we don't set rd_amcache to an invalid value.
+ * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
+ * install a pointer to the newly-allocated storage in the actual
+ * relcache entry until both have succeeded.
+ */
+ if (rel->rd_amcache == NULL)
+ cache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(HashMetaPageData));
+
+ /* Read the metapage. */
+ if (BufferIsValid(*metabuf))
+ LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+ else
+ *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+ LH_META_PAGE);
+ page = BufferGetPage(*metabuf);
+
+ /* Populate the cache. */
+ if (rel->rd_amcache == NULL)
+ rel->rd_amcache = cache;
+ memcpy(rel->rd_amcache, HashPageGetMeta(page),
+ sizeof(HashMetaPageData));
+
+ /* Release metapage lock, but keep the pin. */
+ LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ return (HashMetaPage) rel->rd_amcache;
+}
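A minimal usage sketch for _hash_getcachedmetap(), mirroring the pattern used by _hash_getbucketbuf_from_hashkey() below: start with InvalidBuffer, let the function pin the metapage only if it has to refresh the cache, and drop the pin when done.

    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;

    metap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(metap != NULL);

    /* ...use metap->hashm_maxbucket, hashm_highmask, hashm_lowmask... */

    if (BufferIsValid(metabuf))
        _hash_dropbuf(rel, metabuf);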
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ * hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated. This
+ * gives us an opportunity to use the previously saved metapage contents to
+ * reach the target bucket buffer, instead of reading from the metapage every
+ * time. This saves one buffer access each time we want to reach the target
+ * bucket buffer, which is a very helpful saving in bufmgr traffic and
+ * contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set to the metapage contents used for the
+ * hashkey-to-bucket mapping. Some callers need this info to reach the old
+ * bucket in case of a bucket split; see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+ HashMetaPage *cachedmetap)
+{
+ HashMetaPage metap;
+ Buffer buf;
+ Buffer metabuf = InvalidBuffer;
+ Page page;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashPageOpaque opaque;
+
+ /* We read from the target bucket buffer, so locking it is a must. */
+ Assert(access == HASH_READ || access == HASH_WRITE);
+
+ metap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(metap != NULL);
+
+ /*
+ * Loop until we get a lock on the correct target bucket.
+ */
+ for (;;)
+ {
+ /*
+ * Compute the target bucket number, and convert to block number.
+ */
+ bucket = _hash_hashkey2bucket(hashkey,
+ metap->hashm_maxbucket,
+ metap->hashm_highmask,
+ metap->hashm_lowmask);
+
+ blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+ /* Fetch the primary bucket page for the bucket */
+ buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
+
+ /*
+ * If this bucket hasn't been split since we cached the metapage
+ * contents, our hashkey-to-bucket mapping is still valid and we're done.
+ */
+ if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+ break;
+
+ /* Drop lock on this buffer, update cached metapage, and retry. */
+ _hash_relbuf(rel, buf);
+ metap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(metap != NULL);
+ }
+
+ if (BufferIsValid(metabuf))
+ _hash_dropbuf(rel, metabuf);
+
+ if (cachedmetap)
+ *cachedmetap = metap;
+
+ return buf;