* hashsearch.c
* search code for postgres hash tables
*
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.26 2001/03/23 04:49:51 momjian Exp $
+ * src/backend/access/hash/hashsearch.c
*
*-------------------------------------------------------------------------
*/
-
#include "postgres.h"
#include "access/hash.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/rel.h"
-/*
- * _hash_search() -- Finds the page/bucket that the contains the
- * scankey and loads it into *bufP. the buffer has a read lock.
- */
-void
-_hash_search(Relation rel,
- int keysz,
- ScanKey scankey,
- Buffer *bufP,
- HashMetaPage metap)
-{
- BlockNumber blkno;
- Datum keyDatum;
- Bucket bucket;
-
- if (scankey == (ScanKey) NULL ||
- (keyDatum = scankey[0].sk_argument) == (Datum) NULL)
- {
-
- /*
- * If the scankey argument is NULL, all tuples will satisfy the
- * scan so we start the scan at the first bucket (bucket 0).
- */
- bucket = 0;
- }
- else
- bucket = _hash_call(rel, metap, keyDatum);
-
- blkno = BUCKET_TO_BLKNO(bucket);
-
- *bufP = _hash_getbuf(rel, blkno, HASH_READ);
-}
-
/*
* _hash_next() -- Get the next item in a scan.
*
- * On entry, we have a valid currentItemData in the scan, and a
- * read lock on the page that contains that item. We do not have
- * the page pinned. We return the next item in the scan. On
- * exit, we have the page containing the next item locked but not
- * pinned.
+ * On entry, we have a valid hashso_curpos in the scan, and a
+ * pin and read lock on the page that contains that item.
+ * We find the next item in the scan, if any.
+ * On success exit, we have the page containing the next item
+ * pinned and locked.
*/
-RetrieveIndexResult
+bool
_hash_next(IndexScanDesc scan, ScanDirection dir)
{
- Relation rel;
+ Relation rel = scan->indexRelation;
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
Buffer buf;
- Buffer metabuf;
Page page;
OffsetNumber offnum;
- RetrieveIndexResult res;
ItemPointer current;
- HashItem hitem;
IndexTuple itup;
- HashScanOpaque so;
-
- rel = scan->relation;
- so = (HashScanOpaque) scan->opaque;
- current = &(scan->currentItemData);
-
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
-
- /*
- * XXX 10 may 91: somewhere there's a bug in our management of the
- * cached buffer for this scan. wei discovered it. the following is
- * a workaround so he can work until i figure out what's going on.
- */
-
- if (!BufferIsValid(so->hashso_curbuf))
- {
- so->hashso_curbuf = _hash_getbuf(rel,
- ItemPointerGetBlockNumber(current),
- HASH_READ);
- }
- /* we still have the buffer pinned and locked */
+ /* we still have the buffer pinned and read-locked */
buf = so->hashso_curbuf;
+ Assert(BufferIsValid(buf));
/*
- * step to next valid tuple. note that _hash_step releases our lock
- * on 'metabuf'; if we switch to a new 'buf' while looking for the
- * next tuple, we come back with a lock on that buffer.
+ * step to next valid tuple.
*/
- if (!_hash_step(scan, &buf, dir, metabuf))
- return (RetrieveIndexResult) NULL;
+ if (!_hash_step(scan, &buf, dir))
+ return false;
/* if we're here, _hash_step found a valid tuple */
- current = &(scan->currentItemData);
+ current = &(so->hashso_curpos);
offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- _hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
- hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &hitem->hash_itup;
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ so->hashso_heappos = itup->t_tid;
- return res;
+ return true;
}
+/*
+ * Advance to next page in a bucket, if any.
+ */
static void
_hash_readnext(Relation rel,
Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
BlockNumber blkno;
blkno = (*opaquep)->hasho_nextblkno;
- _hash_relbuf(rel, *bufp, HASH_READ);
+ _hash_relbuf(rel, *bufp);
*bufp = InvalidBuffer;
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno))
{
- *bufp = _hash_getbuf(rel, blkno, HASH_READ);
- *pagep = BufferGetPage(*bufp);
- _hash_checkpage(*pagep, LH_OVERFLOW_PAGE);
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+ *pagep = BufferGetPage(*bufp, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
- Assert(!PageIsEmpty(*pagep));
}
}
+/*
+ * Advance to previous page in a bucket, if any.
+ */
static void
_hash_readprev(Relation rel,
Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
BlockNumber blkno;
blkno = (*opaquep)->hasho_prevblkno;
- _hash_relbuf(rel, *bufp, HASH_READ);
+ _hash_relbuf(rel, *bufp);
*bufp = InvalidBuffer;
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno))
{
- *bufp = _hash_getbuf(rel, blkno, HASH_READ);
- *pagep = BufferGetPage(*bufp);
- _hash_checkpage(*pagep, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ,
+ LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ *pagep = BufferGetPage(*bufp, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
- if (PageIsEmpty(*pagep))
- {
- Assert((*opaquep)->hasho_flag & LH_BUCKET_PAGE);
- _hash_relbuf(rel, *bufp, HASH_READ);
- *bufp = InvalidBuffer;
- }
}
}
/*
* _hash_first() -- Find the first item in a scan.
*
- * Return the RetrieveIndexResult of the first item in the tree that
- * satisfies the qualificatin associated with the scan descriptor. On
- * exit, the page containing the current index tuple is read locked
+ * Find the first item in the index that
+ * satisfies the qualification associated with the scan descriptor. On
+ * success, the page containing the current index tuple is read locked
* and pinned, and the scan's opaque data entry is updated to
* include the buffer.
*/
-RetrieveIndexResult
+bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
- Relation rel;
+ Relation rel = scan->indexRelation;
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ ScanKey cur;
+ uint32 hashkey;
+ Bucket bucket;
+ BlockNumber blkno;
+ BlockNumber oldblkno = InvalidBlockNumber;
+ bool retry = false;
Buffer buf;
Buffer metabuf;
Page page;
HashPageOpaque opaque;
HashMetaPage metap;
- HashItem hitem;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
- RetrieveIndexResult res;
- HashScanOpaque so;
- rel = scan->relation;
- so = (HashScanOpaque) scan->opaque;
- current = &(scan->currentItemData);
+ pgstat_count_index_scan(rel);
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
- metap = (HashMetaPage) BufferGetPage(metabuf);
- _hash_checkpage((Page) metap, LH_META_PAGE);
+ current = &(so->hashso_curpos);
+ ItemPointerSetInvalid(current);
/*
- * XXX -- The attribute number stored in the scan key is the attno in
- * the heap relation. We need to transmogrify this into the index
- * relation attno here. For the moment, we have hardwired attno == 1.
+ * We do not support hash scans with no index qualification, because we
+ * would have to read the whole index rather than just one bucket. That
+ * creates a whole raft of problems, since we haven't got a practical way
+ * to lock all the buckets against splits or compactions.
*/
+ if (scan->numberOfKeys < 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("hash indexes do not support whole-index scans")));
- /* find the correct bucket page and load it into buf */
- _hash_search(rel, 1, scan->keyData, &buf, metap);
- page = BufferGetPage(buf);
- _hash_checkpage(page, LH_BUCKET_PAGE);
- opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ /* There may be more than one index qual, but we hash only the first */
+ cur = &scan->keyData[0];
+
+ /* We support only single-column hash indexes */
+ Assert(cur->sk_attno == 1);
+ /* And there's only one operator strategy, too */
+ Assert(cur->sk_strategy == HTEqualStrategyNumber);
/*
- * if we are scanning forward, we need to find the first non-empty
- * page (if any) in the bucket chain. since overflow pages are never
- * empty, this had better be either the bucket page or the first
- * overflow page.
+ * If the constant in the index qual is NULL, assume it cannot match any
+ * items in the index.
+ */
+ if (cur->sk_flags & SK_ISNULL)
+ return false;
+
+ /*
+ * Okay to compute the hash key. We want to do this before acquiring any
+ * locks, in case a user-defined hash function happens to be slow.
+ *
+ * If scankey operator is not a cross-type comparison, we can use the
+ * cached hash function; otherwise gotta look it up in the catalogs.
*
- * if we are scanning backward, we always go all the way to the end of
- * the bucket chain.
+ * We support the convention that sk_subtype == InvalidOid means the
+ * opclass input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ if (cur->sk_subtype == rel->rd_opcintype[0] ||
+ cur->sk_subtype == InvalidOid)
+ hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
+ else
+ hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
+ cur->sk_subtype);
+
+ so->hashso_sk_hash = hashkey;
+
+ /* Read the metapage */
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+ page = BufferGetPage(metabuf, NULL, NULL,
+ BGP_NO_SNAPSHOT_TEST);
+ metap = HashPageGetMeta(page);
+
+ /*
+ * Loop until we get a lock on the correct target bucket.
*/
- if (PageIsEmpty(page))
+ for (;;)
{
- if (BlockNumberIsValid(opaque->hasho_nextblkno))
- _hash_readnext(rel, &buf, &page, &opaque);
- else
+ /*
+ * Compute the target bucket number, and convert to block number.
+ */
+ bucket = _hash_hashkey2bucket(hashkey,
+ metap->hashm_maxbucket,
+ metap->hashm_highmask,
+ metap->hashm_lowmask);
+
+ blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+ /* Release metapage lock, but keep pin. */
+ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+
+ /*
+ * If the previous iteration of this loop locked what is still the
+ * correct target bucket, we are done. Otherwise, drop any old lock
+ * and lock what now appears to be the correct bucket.
+ */
+ if (retry)
{
- ItemPointerSetInvalid(current);
- so->hashso_curbuf = InvalidBuffer;
-
- /*
- * If there is no scankeys, all tuples will satisfy the scan -
- * so we continue in _hash_step to get tuples from all
- * buckets. - vadim 04/29/97
- */
- if (scan->numberOfKeys >= 1)
- {
- _hash_relbuf(rel, buf, HASH_READ);
- _hash_relbuf(rel, metabuf, HASH_READ);
- return (RetrieveIndexResult) NULL;
- }
+ if (oldblkno == blkno)
+ break;
+ _hash_droplock(rel, oldblkno, HASH_SHARE);
}
+ _hash_getlock(rel, blkno, HASH_SHARE);
+
+ /*
+ * Reacquire metapage lock and check that no bucket split has taken
+ * place while we were awaiting the bucket lock.
+ */
+ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
+ oldblkno = blkno;
+ retry = true;
}
+
+ /* done with the metapage */
+ _hash_dropbuf(rel, metabuf);
+
+ /* Update scan opaque state to show we have lock on the bucket */
+ so->hashso_bucket = bucket;
+ so->hashso_bucket_valid = true;
+ so->hashso_bucket_blkno = blkno;
+
+ /* Fetch the primary bucket page for the bucket */
+ buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+ page = BufferGetPage(buf, NULL, NULL,
+ BGP_NO_SNAPSHOT_TEST);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+
+ /* If a backwards scan is requested, move to the end of the chain */
if (ScanDirectionIsBackward(dir))
{
while (BlockNumberIsValid(opaque->hasho_nextblkno))
_hash_readnext(rel, &buf, &page, &opaque);
}
- if (!_hash_step(scan, &buf, dir, metabuf))
- return (RetrieveIndexResult) NULL;
+ /* Now find the first tuple satisfying the qualification */
+ if (!_hash_step(scan, &buf, dir))
+ return false;
/* if we're here, _hash_step found a valid tuple */
- current = &(scan->currentItemData);
offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- _hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
- hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &hitem->hash_itup;
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ so->hashso_heappos = itup->t_tid;
- return res;
+ return true;
}
/*
* _hash_step() -- step to the next valid item in a scan in the bucket.
*
* If no valid record exists in the requested direction, return
- * false. Else, return true and set the CurrentItemData for the
+ * false. Else, return true and set the hashso_curpos for the
* scan to the right thing.
*
- * 'bufP' points to the buffer which contains the current page
- * that we'll step through.
- *
- * 'metabuf' is released when this returns.
+ * 'bufP' points to the current buffer, which is pinned and read-locked.
+ * On success exit, we have pin and read-lock on whichever page
+ * contains the right item; on failure, we have released all buffers.
*/
bool
-_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
+_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
- Relation rel;
+ Relation rel = scan->indexRelation;
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
ItemPointer current;
- HashScanOpaque so;
- int allbuckets;
- HashMetaPage metap;
Buffer buf;
Page page;
HashPageOpaque opaque;
OffsetNumber maxoff;
OffsetNumber offnum;
- Bucket bucket;
BlockNumber blkno;
- HashItem hitem;
IndexTuple itup;
- rel = scan->relation;
- current = &(scan->currentItemData);
- so = (HashScanOpaque) scan->opaque;
- allbuckets = (scan->numberOfKeys < 1);
-
- metap = (HashMetaPage) BufferGetPage(metabuf);
- _hash_checkpage((Page) metap, LH_META_PAGE);
+ current = &(so->hashso_curpos);
buf = *bufP;
- page = BufferGetPage(buf);
- _hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/*
- * If _hash_step is called from _hash_first, current will not be
- * valid, so we can't dereference it. However, in that case, we
- * presumably want to start at the beginning/end of the page...
+ * If _hash_step is called from _hash_first, current will not be valid, so
+ * we can't dereference it. However, in that case, we presumably want to
+ * start at the beginning/end of the page...
*/
maxoff = PageGetMaxOffsetNumber(page);
if (ItemPointerIsValid(current))
	offnum = ItemPointerGetOffsetNumber(current);
else
	offnum = InvalidOffsetNumber;
/*
- * 'offnum' now points to the last tuple we have seen (if any).
+ * 'offnum' now points to the last tuple we examined (if any).
*
* continue to step through tuples until: 1) we get to the end of the
* bucket chain or 2) we find a valid tuple.
*/
do
{
- bucket = opaque->hasho_bucket;
-
switch (dir)
{
case ForwardScanDirection:
if (offnum != InvalidOffsetNumber)
- {
offnum = OffsetNumberNext(offnum); /* move forward */
- }
else
{
- offnum = FirstOffsetNumber; /* new page */
+ /* new page, locate starting position by binary search */
+ offnum = _hash_binsearch(page, so->hashso_sk_hash);
}
- while (offnum > maxoff)
+
+ for (;;)
{
+ /*
+ * check if we're still in the range of items with the
+ * target hash key
+ */
+ if (offnum <= maxoff)
+ {
+ Assert(offnum >= FirstOffsetNumber);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+ break; /* yes, so exit for-loop */
+ }
- /*--------
- * either this page is empty
- * (maxoff == InvalidOffsetNumber)
- * or we ran off the end.
- *--------
+ /*
+ * ran off the end of this page, try the next
*/
_hash_readnext(rel, &buf, &page, &opaque);
- if (BufferIsInvalid(buf))
- { /* end of chain */
- if (allbuckets && bucket < metap->hashm_maxbucket)
- {
- ++bucket;
- blkno = BUCKET_TO_BLKNO(bucket);
- buf = _hash_getbuf(rel, blkno, HASH_READ);
- page = BufferGetPage(buf);
- _hash_checkpage(page, LH_BUCKET_PAGE);
- opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(opaque->hasho_bucket == bucket);
- while (PageIsEmpty(page) &&
- BlockNumberIsValid(opaque->hasho_nextblkno))
- _hash_readnext(rel, &buf, &page, &opaque);
- maxoff = PageGetMaxOffsetNumber(page);
- offnum = FirstOffsetNumber;
- }
- else
- {
- maxoff = offnum = InvalidOffsetNumber;
- break; /* while */
- }
+ if (BufferIsValid(buf))
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = _hash_binsearch(page, so->hashso_sk_hash);
}
else
{
- /* _hash_readnext never returns an empty page */
- maxoff = PageGetMaxOffsetNumber(page);
- offnum = FirstOffsetNumber;
+ /* end of bucket */
+ itup = NULL;
+ break; /* exit for-loop */
}
}
break;
+
case BackwardScanDirection:
if (offnum != InvalidOffsetNumber)
- {
offnum = OffsetNumberPrev(offnum); /* move back */
- }
else
{
- offnum = maxoff; /* new page */
+ /* new page, locate starting position by binary search */
+ offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
- while (offnum < FirstOffsetNumber)
+
+ for (;;)
{
+ /*
+ * check if we're still in the range of items with the
+ * target hash key
+ */
+ if (offnum >= FirstOffsetNumber)
+ {
+ Assert(offnum <= maxoff);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+ break; /* yes, so exit for-loop */
+ }
- /*---------
- * either this page is empty
- * (offnum == InvalidOffsetNumber)
- * or we ran off the end.
- *---------
+ /*
+ * ran off the end of this page, try the next
*/
_hash_readprev(rel, &buf, &page, &opaque);
- if (BufferIsInvalid(buf))
- { /* end of chain */
- if (allbuckets && bucket > 0)
- {
- --bucket;
- blkno = BUCKET_TO_BLKNO(bucket);
- buf = _hash_getbuf(rel, blkno, HASH_READ);
- page = BufferGetPage(buf);
- _hash_checkpage(page, LH_BUCKET_PAGE);
- opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(opaque->hasho_bucket == bucket);
- while (BlockNumberIsValid(opaque->hasho_nextblkno))
- _hash_readnext(rel, &buf, &page, &opaque);
- maxoff = offnum = PageGetMaxOffsetNumber(page);
- }
- else
- {
- maxoff = offnum = InvalidOffsetNumber;
- break; /* while */
- }
+ if (BufferIsValid(buf))
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
else
{
- /* _hash_readprev never returns an empty page */
- maxoff = offnum = PageGetMaxOffsetNumber(page);
+ /* end of bucket */
+ itup = NULL;
+ break; /* exit for-loop */
}
}
break;
+
default:
/* NoMovementScanDirection */
/* this should not be reached */
+ itup = NULL;
break;
}
- /* we ran off the end of the world without finding a match */
- if (offnum == InvalidOffsetNumber)
+ if (itup == NULL)
{
- _hash_relbuf(rel, metabuf, HASH_READ);
+ /* we ran off the end of the bucket without finding a match */
*bufP = so->hashso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
return false;
}
- /* get ready to check this tuple */
- hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &hitem->hash_itup;
+ /* check the tuple quals, loop around if not met */
} while (!_hash_checkqual(scan, itup));
/* if we made it to here, we've found a valid tuple */
- _hash_relbuf(rel, metabuf, HASH_READ);
blkno = BufferGetBlockNumber(buf);
*bufP = so->hashso_curbuf = buf;
ItemPointerSet(current, blkno, offnum);