pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
/*
- * Set hasho_prevblkno with current hashm_maxbucket. This value will
- * be used to validate cached HashMetaPageData. See
+ * Set hasho_prevblkno with current hashm_maxbucket. This value will be
+ * used to validate cached HashMetaPageData. See
* _hash_getbucketbuf_from_hashkey().
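+ * (A primary bucket page does not need hasho_prevblkno as a chain link,
+ * so the field is free to carry this value; overflow pages still use it
+ * to point at the previous page of the bucket chain.)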
*/
pageopaque->hasho_prevblkno = max_bucket;
{
/* release pin we hold on primary bucket page */
if (BufferIsValid(so->hashso_bucket_buf) &&
- so->hashso_bucket_buf != so->hashso_curbuf)
+ so->hashso_bucket_buf != so->currPos.buf)
_hash_dropbuf(rel, so->hashso_bucket_buf);
so->hashso_bucket_buf = InvalidBuffer;
/* release pin we hold on primary bucket page of bucket being split */
if (BufferIsValid(so->hashso_split_bucket_buf) &&
- so->hashso_split_bucket_buf != so->hashso_curbuf)
+ so->hashso_split_bucket_buf != so->currPos.buf)
_hash_dropbuf(rel, so->hashso_split_bucket_buf);
so->hashso_split_bucket_buf = InvalidBuffer;
/* release any pin we still hold */
- if (BufferIsValid(so->hashso_curbuf))
- _hash_dropbuf(rel, so->hashso_curbuf);
- so->hashso_curbuf = InvalidBuffer;
+ if (BufferIsValid(so->currPos.buf))
+ _hash_dropbuf(rel, so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
/* reset split scan */
so->hashso_buc_populated = false;
int32 ffactor;
uint32 num_buckets;
uint32 i;
+ bool use_wal;
/* safety check */
if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
RelationGetRelationName(rel));
+ /*
+ * WAL-log creation of pages if the relation is persistent, or if this is
+ * the init fork. Init forks for unlogged relations always need to be WAL
+ * logged.
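+ * (At the end of crash recovery an unlogged relation's init fork is
+ * copied to re-create its empty main fork, so the init fork must be
+ * recoverable from WAL.)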
+ */
+ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
+
/*
* Determine the target fill factor (in tuples per bucket) for this index.
* The idea is to make the fill factor correspond to pages about as full
if (ffactor < 10)
ffactor = 10;
- procid = index_getprocid(rel, 1, HASHPROC);
+ procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
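+ /*
+ * (HASHSTANDARD_PROC is support function 1, which computes the standard
+ * 32-bit hash of a key.)
+ */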
/*
* We initialize the metapage, the first N bucket pages, and the first
metap = HashPageGetMeta(pg);
/* XLOG stuff */
- if (RelationNeedsWAL(rel))
+ if (use_wal)
{
xl_hash_init_meta_page xlrec;
XLogRecPtr recptr;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
- XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
_hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
MarkBufferDirty(buf);
- log_newpage(&rel->rd_node,
- forkNum,
- blkno,
- BufferGetPage(buf),
- true);
+ if (use_wal)
+ log_newpage(&rel->rd_node,
+ forkNum,
+ blkno,
+ BufferGetPage(buf),
+ true);
_hash_relbuf(rel, buf);
}
MarkBufferDirty(metabuf);
/* XLOG stuff */
- if (RelationNeedsWAL(rel))
+ if (use_wal)
{
xl_hash_init_bitmap_page xlrec;
XLogRecPtr recptr;
* Choose the number of initial bucket pages to match the fill factor
* given the estimated number of tuples. We round up the result to the
* total number of buckets which has to be allocated before using its
- * _hashm_spare element. However always force at least 2 bucket pages.
- * The upper limit is determined by considerations explained in
+ * _hashm_spare element. However, always force at least 2 bucket pages. The
+ * upper limit is determined by considerations explained in
* _hash_expandtable().
*/
dnumbuckets = num_tuples / ffactor;
metap->hashm_maxbucket = num_buckets - 1;
/*
- * Set highmask as next immediate ((2 ^ x) - 1), which should be sufficient
- * to cover num_buckets.
+ * Set highmask to the next immediate ((2 ^ x) - 1) value, which should
+ * be sufficient to cover num_buckets.
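+ * For example, with num_buckets = 4, highmask is 7 and lowmask is 3.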
*/
metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
metap->hashm_lowmask = (metap->hashm_highmask >> 1);
metap->hashm_firstfree = 0;
/*
- * Set pd_lower just past the end of the metadata. This is to log full
- * page image of metapage in xloginsert.c.
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, the metadata will be lost if xlog.c
+ * compresses the page.
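+ * (xlog.c's full-page images omit the "hole" between pd_lower and
+ * pd_upper on standard pages, so the metadata must lie below pd_lower
+ * to survive.)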
*/
((PageHeader) page)->pd_lower =
((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
{
/*
* Copy bucket mapping info now; refer to the comment in code below
- * where we copy this information before calling _hash_splitbucket
- * to see why this is okay.
+ * where we copy this information before calling _hash_splitbucket to
+ * see why this is okay.
*/
maxbucket = metap->hashm_maxbucket;
highmask = metap->hashm_highmask;
* We treat allocation of buckets as a separate WAL-logged action.
* Even if we fail after this operation, we won't leak bucket pages;
* rather, the next split will consume this space. In any case, even
- * without failure we don't use all the space in one split
- * operation.
+ * without failure we don't use all the space in one split operation.
*/
buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
/*
* Mark the old bucket to indicate that split is in progress. (At
- * operation end, we will clear the split-in-progress flag.) Also,
- * for a primary bucket page, hasho_prevblkno stores the number of
- * buckets that existed as of the last split, so we must update that
- * value here.
+ * operation end, we will clear the split-in-progress flag.) Also, for a
+ * primary bucket page, hasho_prevblkno stores the number of buckets that
+ * existed as of the last split, so we must update that value here.
*/
oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
oopaque->hasho_prevblkno = maxbucket;
XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
sizeof(uint32));
XLogRegisterBufData(2,
- (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
+ (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
sizeof(uint32));
}
buf_oblkno, buf_nblkno, NULL,
maxbucket, highmask, lowmask);
- /* all done, now release the locks and pins on primary buckets. */
- _hash_relbuf(rel, buf_oblkno);
- _hash_relbuf(rel, buf_nblkno);
+ /* all done, now release the pins on primary buckets. */
+ _hash_dropbuf(rel, buf_oblkno);
+ _hash_dropbuf(rel, buf_nblkno);
return;
BlockNumber lastblock;
char zerobuf[BLCKSZ];
Page page;
+ HashPageOpaque ovflopaque;
lastblock = firstblock + nblocks - 1;
/*
* Initialize the page. Just zeroing the page won't work; see
- * _hash_freeovflpage for similar usage.
+ * _hash_freeovflpage for similar usage. We take care to make the special
+ * space valid for the benefit of tools such as pageinspect.
*/
_hash_pageinit(page, BLCKSZ);
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
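+ /*
+ * (These pre-allocated pages belong to no bucket yet, hence
+ * hasho_bucket = -1 and LH_UNUSED_PAGE; a future split will initialize
+ * each page properly when it first comes into use.)
+ */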
+ ovflopaque->hasho_prevblkno = InvalidBlockNumber;
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_bucket = -1;
+ ovflopaque->hasho_flag = LH_UNUSED_PAGE;
+ ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
if (RelationNeedsWAL(rel))
log_newpage(&rel->rd_node,
MAIN_FORKNUM,
* while a split is in progress.
*
* In addition, the caller must have created the new bucket's base page,
- * which is passed in buffer nbuf, pinned and write-locked. That lock and
- * pin are released here. (The API is set up this way because we must do
- * _hash_getnewbuf() before releasing the metapage write lock. So instead of
- * passing the new bucket's start block number, we pass an actual buffer.)
+ * which is passed in buffer nbuf, pinned and write-locked. The lock will be
+ * released here and the pin must be released by the caller. (The API is set up
+ * this way because we must do _hash_getnewbuf() before releasing the metapage
+ * write lock. So instead of passing the new bucket's start block number, we
+ * pass an actual buffer.)
*/
static void
_hash_splitbucket(Relation rel,
/*
* After the split is finished, mark the old bucket to indicate that it
- * contains deletable tuples. Vacuum will clear split-cleanup flag after
- * deleting such tuples.
+ * contains deletable tuples. We will clear the split-cleanup flag after
+ * deleting such tuples, either at the end of this split, at the next
+ * split of the old bucket, or during vacuum.
*/
oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
}
END_CRIT_SECTION();
+
+ /*
+ * If possible, clean up the old bucket. We might not be able to do this
+ * if someone else has a pin on it, but if not then we can go ahead. This
+ * isn't absolutely necessary, but it reduces bloat; if we don't do it
+ * now, VACUUM will do it eventually, but maybe not until new overflow
+ * pages have been allocated. Note that there's no need to clean up the
+ * new bucket.
+ */
+ if (IsBufferCleanupOK(bucket_obuf))
+ {
+ LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
+ hashbucketcleanup(rel, obucket, bucket_obuf,
+ BufferGetBlockNumber(bucket_obuf), NULL,
+ maxbucket, highmask, lowmask, NULL, NULL, true,
+ NULL, NULL);
+ }
+ else
+ {
+ LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
+ }
}
/*
nbucket, obuf, bucket_nbuf, tidhtab,
maxbucket, highmask, lowmask);
- _hash_relbuf(rel, bucket_nbuf);
- LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+ _hash_dropbuf(rel, bucket_nbuf);
hash_destroy(tidhtab);
}
* _hash_getcachedmetap() -- Returns cached metapage data.
*
* If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
- * the metapage. If not set, we'll set it before returning if we have to
- * refresh the cache, and return with a pin but no lock on it; caller is
- * responsible for releasing the pin.
+ * the metapage. If not set, we'll set it before returning if we have to
+ * refresh the cache, and return with a pin but no lock on it; caller is
+ * responsible for releasing the pin.
*
- * We refresh the cache if it's not initialized yet or force_refresh is true.
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
*/
HashMetaPage
_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
Assert(metabuf);
if (force_refresh || rel->rd_amcache == NULL)
{
- char *cache = NULL;
+ char *cache = NULL;
/*
- * It's important that we don't set rd_amcache to an invalid
- * value. Either MemoryContextAlloc or _hash_getbuf could fail,
- * so don't install a pointer to the newly-allocated storage in the
- * actual relcache entry until both have succeeeded.
+ * It's important that we don't set rd_amcache to an invalid value.
+ * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
+ * install a pointer to the newly-allocated storage in the actual
+ * relcache entry until both have succeeded.
*/
if (rel->rd_amcache == NULL)
cache = MemoryContextAlloc(rel->rd_indexcxt,
* us an opportunity to use the previously saved metapage contents to reach
* the target bucket buffer, instead of reading from the metapage every time.
* This saves one buffer access every time we want to reach the target bucket
- * buffer, which is very helpful savings in bufmgr traffic and contention.
+ * buffer, which is a very helpful savings in bufmgr traffic and contention.
*
* The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
* bucket buffer has to be locked for reading or writing.
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
+ Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
/*
* If this bucket hasn't been split, we're done.
- *
- * NB: The check for InvalidBlockNumber is only needed for on-disk
- * compatibility with indexes created before we started storing
- * hashm_maxbucket in the primary page's hasho_prevblkno.
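+ *
+ * (hasho_prevblkno of a primary bucket page holds the hashm_maxbucket
+ * value as of that bucket's most recent split, so if our cached
+ * metapage's hashm_maxbucket is at least that large, the cache is
+ * recent enough to have routed us to the correct bucket.)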
*/
- if (opaque->hasho_prevblkno == InvalidBlockNumber ||
- opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+ if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
break;
/* Drop lock on this buffer, update cached metapage, and retry. */