]> granicus.if.org Git - postgresql/blobdiff - src/backend/access/hash/hashsearch.c
Modify BufferGetPage() to prepare for "snapshot too old" feature
[postgresql] / src / backend / access / hash / hashsearch.c
index 80ca30efd159c8fc88ecee8e9c6170868b1adb30..dd1f464e53addfa00e02a00615f2cfa3c07bc898 100644 (file)
 /*-------------------------------------------------------------------------
  *
- * hashsearch.c--
- *    search code for postgres hash tables
+ * hashsearch.c
+ *       search code for postgres hash tables
  *
- * Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.3 1996/10/20 08:31:49 scrappy Exp $
+ *       src/backend/access/hash/hashsearch.c
  *
  *-------------------------------------------------------------------------
  */
-
 #include "postgres.h"
-#include "catalog/pg_attribute.h"
-#include "access/attnum.h"
-#include "nodes/pg_list.h" 
-#include "access/tupdesc.h"
-#include "storage/fd.h"
-#include "catalog/pg_am.h" 
-#include "catalog/pg_class.h"
-#include "nodes/nodes.h"
-#include "rewrite/prs2lock.h"
-#include "access/skey.h"
-#include "access/strat.h"
-#include "utils/rel.h"
-#include "storage/block.h"  
-#include "storage/off.h"
-#include "storage/itemptr.h"
-#include <time.h>
-#include "utils/nabstime.h"
-#include "access/htup.h"
-#include "access/itup.h"   
-#include "storage/itemid.h"
-#include "storage/item.h"
-#include "storage/buf.h"   
-#include "storage/bufpage.h" 
-#include "access/sdir.h"
-#include "access/funcindex.h"
-#include "utils/tqual.h"
-#include "storage/buf.h" 
-#include "access/relscan.h"
+
 #include "access/hash.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/rel.h"
 
-/*
- *  _hash_search() -- Finds the page/bucket that the contains the
- *  scankey and loads it into *bufP.  the buffer has a read lock.
- */
-void
-_hash_search(Relation rel,
-            int keysz,
-            ScanKey scankey,
-            Buffer *bufP,
-            HashMetaPage metap)
-{
-    BlockNumber blkno;
-    Datum keyDatum;
-    Bucket bucket;
-
-    if (scankey == (ScanKey) NULL ||
-       (keyDatum = scankey[0].sk_argument) == (Datum) NULL) {
-       /* 
-        * If the scankey argument is NULL, all tuples will satisfy
-        * the scan so we start the scan at the first bucket (bucket
-        * 0).
-        */
-       bucket = 0;
-    } else {
-       bucket = _hash_call(rel, metap, keyDatum);
-    }
-
-    blkno = BUCKET_TO_BLKNO(bucket);
-    
-    *bufP = _hash_getbuf(rel, blkno, HASH_READ);
-}
 
 /*
- *  _hash_next() -- Get the next item in a scan.
+ *     _hash_next() -- Get the next item in a scan.
  *
- *     On entry, we have a valid currentItemData in the scan, and a
- *     read lock on the page that contains that item.  We do not have
- *     the page pinned.  We return the next item in the scan.  On
- *     exit, we have the page containing the next item locked but not
- *     pinned.
+ *             On entry, we have a valid hashso_curpos in the scan, and a
+ *             pin and read lock on the page that contains that item.
+ *             We find the next item in the scan, if any.
+ *             On success exit, we have the page containing the next item
+ *             pinned and locked.
  */
-RetrieveIndexResult
+bool
 _hash_next(IndexScanDesc scan, ScanDirection dir)
 {
-    Relation rel;
-    Buffer buf;
-    Buffer metabuf;
-    Page page;
-    OffsetNumber offnum;
-    RetrieveIndexResult res;
-    ItemPointer current;
-    ItemPointer iptr;
-    HashItem hitem;
-    IndexTuple itup;
-    HashScanOpaque so;
-
-    rel = scan->relation;
-    so = (HashScanOpaque) scan->opaque; 
-    current = &(scan->currentItemData);
-
-    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
-
-    /*
-     *  XXX 10 may 91:  somewhere there's a bug in our management of the
-     *  cached buffer for this scan.  wei discovered it.  the following
-     *  is a workaround so he can work until i figure out what's going on.
-     */
-
-    if (!BufferIsValid(so->hashso_curbuf)) {
-       so->hashso_curbuf = _hash_getbuf(rel,
-                                        ItemPointerGetBlockNumber(current),
-                                        HASH_READ);
-    }
-
-    /* we still have the buffer pinned and locked */
-    buf = so->hashso_curbuf;
-
-    /*
-     * step to next valid tuple.  note that _hash_step releases our
-     * lock on 'metabuf'; if we switch to a new 'buf' while looking
-     * for the next tuple, we come back with a lock on that buffer.
-     */
-    if (!_hash_step(scan, &buf, dir, metabuf)) {
-       return ((RetrieveIndexResult) NULL);
-    }
-
-    /* if we're here, _hash_step found a valid tuple */
-    current = &(scan->currentItemData);
-    offnum = ItemPointerGetOffsetNumber(current);
-    page = BufferGetPage(buf);
-    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
-    hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
-    itup = &hitem->hash_itup;
-    iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
-    memmove((char *) iptr, (char *) &(itup->t_tid),  sizeof(ItemPointerData));
-    res = FormRetrieveIndexResult(current, iptr);
-
-    return (res);
+       Relation        rel = scan->indexRelation;
+       HashScanOpaque so = (HashScanOpaque) scan->opaque;
+       Buffer          buf;
+       Page            page;
+       OffsetNumber offnum;
+       ItemPointer current;
+       IndexTuple      itup;
+
+       /* we still have the buffer pinned and read-locked */
+       buf = so->hashso_curbuf;
+       Assert(BufferIsValid(buf));
+
+       /*
+        * step to next valid tuple.
+        */
+       if (!_hash_step(scan, &buf, dir))
+               return false;
+
+       /* if we're here, _hash_step found a valid tuple */
+       current = &(so->hashso_curpos);
+       offnum = ItemPointerGetOffsetNumber(current);
+       _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+       so->hashso_heappos = itup->t_tid;
+
+       return true;
 }
 
+/*
+ * Advance to next page in a bucket, if any.
+ */
 static void
 _hash_readnext(Relation rel,
-              Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+                          Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
 {
-    BlockNumber blkno;
-
-    blkno = (*opaquep)->hasho_nextblkno;
-    _hash_relbuf(rel, *bufp, HASH_READ);
-    *bufp = InvalidBuffer;
-    if (BlockNumberIsValid(blkno)) {
-       *bufp = _hash_getbuf(rel, blkno, HASH_READ);
-       *pagep = BufferGetPage(*bufp);
-       _hash_checkpage(*pagep, LH_OVERFLOW_PAGE);
-       *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
-       Assert(!PageIsEmpty(*pagep));
-    }
+       BlockNumber blkno;
+
+       blkno = (*opaquep)->hasho_nextblkno;
+       _hash_relbuf(rel, *bufp);
+       *bufp = InvalidBuffer;
+       /* check for interrupts while we're not holding any buffer lock */
+       CHECK_FOR_INTERRUPTS();
+       if (BlockNumberIsValid(blkno))
+       {
+               *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+               *pagep = BufferGetPage(*bufp, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+               *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+       }
 }
 
+/*
+ * Advance to previous page in a bucket, if any.
+ */
 static void
 _hash_readprev(Relation rel,
-              Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+                          Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
 {
-    BlockNumber blkno;
-
-    blkno = (*opaquep)->hasho_prevblkno;
-    _hash_relbuf(rel, *bufp, HASH_READ);
-    *bufp = InvalidBuffer;
-    if (BlockNumberIsValid(blkno)) {
-       *bufp = _hash_getbuf(rel, blkno, HASH_READ);
-       *pagep = BufferGetPage(*bufp);
-       _hash_checkpage(*pagep, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
-       *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
-       if (PageIsEmpty(*pagep)) {
-           Assert((*opaquep)->hasho_flag & LH_BUCKET_PAGE);
-           _hash_relbuf(rel, *bufp, HASH_READ);
-           *bufp = InvalidBuffer;
+       BlockNumber blkno;
+
+       blkno = (*opaquep)->hasho_prevblkno;
+       _hash_relbuf(rel, *bufp);
+       *bufp = InvalidBuffer;
+       /* check for interrupts while we're not holding any buffer lock */
+       CHECK_FOR_INTERRUPTS();
+       if (BlockNumberIsValid(blkno))
+       {
+               *bufp = _hash_getbuf(rel, blkno, HASH_READ,
+                                                        LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+               *pagep = BufferGetPage(*bufp, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+               *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
        }
-    }
 }
 
 /*
- *  _hash_first() -- Find the first item in a scan.
+ *     _hash_first() -- Find the first item in a scan.
  *
- *     Return the RetrieveIndexResult of the first item in the tree that
- *     satisfies the qualificatin associated with the scan descriptor. On
- *     exit, the page containing the current index tuple is read locked
- *     and pinned, and the scan's opaque data entry is updated to 
- *     include the buffer.  
+ *             Find the first item in the index that
+ *             satisfies the qualification associated with the scan descriptor. On
+ *             success, the page containing the current index tuple is read locked
+ *             and pinned, and the scan's opaque data entry is updated to
+ *             include the buffer.
  */
-RetrieveIndexResult
+bool
 _hash_first(IndexScanDesc scan, ScanDirection dir)
 {
-    Relation rel;
-    Buffer buf;
-    Buffer metabuf;
-    Page page;
-    HashPageOpaque opaque;
-    HashMetaPage metap;
-    HashItem hitem;
-    IndexTuple itup;
-    ItemPointer current;
-    ItemPointer iptr;
-    OffsetNumber offnum;
-    RetrieveIndexResult res;
-    HashScanOpaque so;
-
-    rel = scan->relation;
-    so = (HashScanOpaque) scan->opaque;
-    current = &(scan->currentItemData);
-
-    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
-    metap = (HashMetaPage) BufferGetPage(metabuf);
-    _hash_checkpage((Page) metap, LH_META_PAGE);
-
-    /*
-     *  XXX -- The attribute number stored in the scan key is the attno
-     *        in the heap relation.  We need to transmogrify this into
-     *         the index relation attno here.  For the moment, we have
-     *        hardwired attno == 1.
-     */
-
-    /* find the correct bucket page and load it into buf */
-    _hash_search(rel, 1, scan->keyData, &buf, metap);
-    page = BufferGetPage(buf);
-    _hash_checkpage(page, LH_BUCKET_PAGE);
-    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-
-    /*
-     * if we are scanning forward, we need to find the first non-empty
-     * page (if any) in the bucket chain.  since overflow pages are
-     * never empty, this had better be either the bucket page or the
-     * first overflow page.
-     *
-     * if we are scanning backward, we always go all the way to the
-     * end of the bucket chain.
-     */
-    if (PageIsEmpty(page)) {
-       if (BlockNumberIsValid(opaque->hasho_nextblkno)) {
-           _hash_readnext(rel, &buf, &page, &opaque);
-       } else {
-           ItemPointerSetInvalid(current);
-           so->hashso_curbuf = InvalidBuffer;
-           return ((RetrieveIndexResult) NULL);
+       Relation        rel = scan->indexRelation;
+       HashScanOpaque so = (HashScanOpaque) scan->opaque;
+       ScanKey         cur;
+       uint32          hashkey;
+       Bucket          bucket;
+       BlockNumber blkno;
+       BlockNumber oldblkno = InvalidBuffer;
+       bool            retry = false;
+       Buffer          buf;
+       Buffer          metabuf;
+       Page            page;
+       HashPageOpaque opaque;
+       HashMetaPage metap;
+       IndexTuple      itup;
+       ItemPointer current;
+       OffsetNumber offnum;
+
+       pgstat_count_index_scan(rel);
+
+       current = &(so->hashso_curpos);
+       ItemPointerSetInvalid(current);
+
+       /*
+        * We do not support hash scans with no index qualification, because we
+        * would have to read the whole index rather than just one bucket. That
+        * creates a whole raft of problems, since we haven't got a practical way
+        * to lock all the buckets against splits or compactions.
+        */
+       if (scan->numberOfKeys < 1)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("hash indexes do not support whole-index scans")));
+
+       /* There may be more than one index qual, but we hash only the first */
+       cur = &scan->keyData[0];
+
+       /* We support only single-column hash indexes */
+       Assert(cur->sk_attno == 1);
+       /* And there's only one operator strategy, too */
+       Assert(cur->sk_strategy == HTEqualStrategyNumber);
+
+       /*
+        * If the constant in the index qual is NULL, assume it cannot match any
+        * items in the index.
+        */
+       if (cur->sk_flags & SK_ISNULL)
+               return false;
+
+       /*
+        * Okay to compute the hash key.  We want to do this before acquiring any
+        * locks, in case a user-defined hash function happens to be slow.
+        *
+        * If scankey operator is not a cross-type comparison, we can use the
+        * cached hash function; otherwise gotta look it up in the catalogs.
+        *
+        * We support the convention that sk_subtype == InvalidOid means the
+        * opclass input type; this is a hack to simplify life for ScanKeyInit().
+        */
+       if (cur->sk_subtype == rel->rd_opcintype[0] ||
+               cur->sk_subtype == InvalidOid)
+               hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
+       else
+               hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
+                                                                                  cur->sk_subtype);
+
+       so->hashso_sk_hash = hashkey;
+
+       /* Read the metapage */
+       metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+       page = BufferGetPage(metabuf, NULL, NULL,
+                                                BGP_NO_SNAPSHOT_TEST);
+       metap = HashPageGetMeta(page);
+
+       /*
+        * Loop until we get a lock on the correct target bucket.
+        */
+       for (;;)
+       {
+               /*
+                * Compute the target bucket number, and convert to block number.
+                */
+               bucket = _hash_hashkey2bucket(hashkey,
+                                                                         metap->hashm_maxbucket,
+                                                                         metap->hashm_highmask,
+                                                                         metap->hashm_lowmask);
+
+               blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+               /* Release metapage lock, but keep pin. */
+               _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+
+               /*
+                * If the previous iteration of this loop locked what is still the
+                * correct target bucket, we are done.  Otherwise, drop any old lock
+                * and lock what now appears to be the correct bucket.
+                */
+               if (retry)
+               {
+                       if (oldblkno == blkno)
+                               break;
+                       _hash_droplock(rel, oldblkno, HASH_SHARE);
+               }
+               _hash_getlock(rel, blkno, HASH_SHARE);
+
+               /*
+                * Reacquire metapage lock and check that no bucket split has taken
+                * place while we were awaiting the bucket lock.
+                */
+               _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
+               oldblkno = blkno;
+               retry = true;
        }
-    }
-    if (ScanDirectionIsBackward(dir)) {
-       while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
-           _hash_readnext(rel, &buf, &page, &opaque);
+
+       /* done with the metapage */
+       _hash_dropbuf(rel, metabuf);
+
+       /* Update scan opaque state to show we have lock on the bucket */
+       so->hashso_bucket = bucket;
+       so->hashso_bucket_valid = true;
+       so->hashso_bucket_blkno = blkno;
+
+       /* Fetch the primary bucket page for the bucket */
+       buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+       page = BufferGetPage(buf, NULL, NULL,
+                                                BGP_NO_SNAPSHOT_TEST);
+       opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+       Assert(opaque->hasho_bucket == bucket);
+
+       /* If a backwards scan is requested, move to the end of the chain */
+       if (ScanDirectionIsBackward(dir))
+       {
+               while (BlockNumberIsValid(opaque->hasho_nextblkno))
+                       _hash_readnext(rel, &buf, &page, &opaque);
        }
-    }
-
-    if (!_hash_step(scan, &buf, dir, metabuf)) {
-       return ((RetrieveIndexResult) NULL);
-    }
-
-    /* if we're here, _hash_step found a valid tuple */
-    current = &(scan->currentItemData);
-    offnum = ItemPointerGetOffsetNumber(current);
-    page = BufferGetPage(buf);
-    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
-    hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
-    itup = &hitem->hash_itup;
-    iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
-    memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
-    res = FormRetrieveIndexResult(current, iptr);
-
-    return (res);
+
+       /* Now find the first tuple satisfying the qualification */
+       if (!_hash_step(scan, &buf, dir))
+               return false;
+
+       /* if we're here, _hash_step found a valid tuple */
+       offnum = ItemPointerGetOffsetNumber(current);
+       _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+       so->hashso_heappos = itup->t_tid;
+
+       return true;
 }
 
 /*
- *  _hash_step() -- step to the next valid item in a scan in the bucket.
+ *     _hash_step() -- step to the next valid item in a scan in the bucket.
  *
- *     If no valid record exists in the requested direction, return
- *     false.  Else, return true and set the CurrentItemData for the
- *     scan to the right thing.
- * 
- *     'bufP' points to the buffer which contains the current page
- *     that we'll step through.
+ *             If no valid record exists in the requested direction, return
+ *             false.  Else, return true and set the hashso_curpos for the
+ *             scan to the right thing.
  *
- *     'metabuf' is released when this returns.
+ *             'bufP' points to the current buffer, which is pinned and read-locked.
+ *             On success exit, we have pin and read-lock on whichever page
+ *             contains the right item; on failure, we have released all buffers.
  */
 bool
-_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
+_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 {
-    Relation rel;
-    ItemPointer current;
-    HashScanOpaque so;
-    int allbuckets;
-    HashMetaPage metap;
-    Buffer buf;
-    Page page;
-    HashPageOpaque opaque;
-    OffsetNumber maxoff;
-    OffsetNumber offnum;
-    Bucket bucket;
-    BlockNumber blkno;
-    HashItem hitem;
-    IndexTuple itup;
-
-    rel = scan->relation;
-    current = &(scan->currentItemData);
-    so = (HashScanOpaque) scan->opaque;
-    allbuckets = (scan->numberOfKeys < 1);
-
-    metap = (HashMetaPage) BufferGetPage(metabuf);
-    _hash_checkpage((Page) metap, LH_META_PAGE);
-
-    buf = *bufP;
-    page = BufferGetPage(buf);
-    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
-    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-
-    /*
-     * If _hash_step is called from _hash_first, current will not be
-     * valid, so we can't dereference it.  However, in that case, we
-     * presumably want to start at the beginning/end of the page...
-     */
-    maxoff = PageGetMaxOffsetNumber(page);
-    if (ItemPointerIsValid(current)) {
-       offnum = ItemPointerGetOffsetNumber(current);
-    } else {
-       offnum = InvalidOffsetNumber;
-    }
-
-    /*
-     * 'offnum' now points to the last tuple we have seen (if any).
-     *
-     * continue to step through tuples until:
-     *           1) we get to the end of the bucket chain or
-     *           2) we find a valid tuple.
-     */
-    do {
-       bucket = opaque->hasho_bucket;
-
-       switch (dir) {
-       case ForwardScanDirection:
-           if (offnum != InvalidOffsetNumber) {
-               offnum = OffsetNumberNext(offnum);      /* move forward */
-           } else {
-               offnum = FirstOffsetNumber;             /* new page */
-           }
-           while (offnum > maxoff) {
-               /*
-                * either this page is empty (maxoff ==
-                * InvalidOffsetNumber) or we ran off the end.
-                */
-               _hash_readnext(rel, &buf, &page, &opaque);
-               if (BufferIsInvalid(buf)) {     /* end of chain */
-                   if (allbuckets && bucket < metap->hashm_maxbucket) {
-                       ++bucket;
-                       blkno = BUCKET_TO_BLKNO(bucket);
-                       buf = _hash_getbuf(rel, blkno, HASH_READ);
-                       page = BufferGetPage(buf);
-                       _hash_checkpage(page, LH_BUCKET_PAGE);
-                       opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-                       Assert(opaque->hasho_bucket == bucket);
-                       while (PageIsEmpty(page) &&
-                              BlockNumberIsValid(opaque->hasho_nextblkno)) {
-                           _hash_readnext(rel, &buf, &page, &opaque);
-                       }
-                       maxoff = PageGetMaxOffsetNumber(page);
-                       offnum = FirstOffsetNumber;
-                   } else {
-                       maxoff = offnum = InvalidOffsetNumber;
-                       break;  /* while */
-                   }
-               } else {
-                   /* _hash_readnext never returns an empty page */
-                   maxoff = PageGetMaxOffsetNumber(page);
-                   offnum = FirstOffsetNumber;
+       Relation        rel = scan->indexRelation;
+       HashScanOpaque so = (HashScanOpaque) scan->opaque;
+       ItemPointer current;
+       Buffer          buf;
+       Page            page;
+       HashPageOpaque opaque;
+       OffsetNumber maxoff;
+       OffsetNumber offnum;
+       BlockNumber blkno;
+       IndexTuple      itup;
+
+       current = &(so->hashso_curpos);
+
+       buf = *bufP;
+       _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+       opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+       /*
+        * If _hash_step is called from _hash_first, current will not be valid, so
+        * we can't dereference it.  However, in that case, we presumably want to
+        * start at the beginning/end of the page...
+        */
+       maxoff = PageGetMaxOffsetNumber(page);
+       if (ItemPointerIsValid(current))
+               offnum = ItemPointerGetOffsetNumber(current);
+       else
+               offnum = InvalidOffsetNumber;
+
+       /*
+        * 'offnum' now points to the last tuple we examined (if any).
+        *
+        * continue to step through tuples until: 1) we get to the end of the
+        * bucket chain or 2) we find a valid tuple.
+        */
+       do
+       {
+               switch (dir)
+               {
+                       case ForwardScanDirection:
+                               if (offnum != InvalidOffsetNumber)
+                                       offnum = OffsetNumberNext(offnum);      /* move forward */
+                               else
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch(page, so->hashso_sk_hash);
+                               }
+
+                               for (;;)
+                               {
+                                       /*
+                                        * check if we're still in the range of items with the
+                                        * target hash key
+                                        */
+                                       if (offnum <= maxoff)
+                                       {
+                                               Assert(offnum >= FirstOffsetNumber);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
+                                        */
+                                       _hash_readnext(rel, &buf, &page, &opaque);
+                                       if (BufferIsValid(buf))
+                                       {
+                                               maxoff = PageGetMaxOffsetNumber(page);
+                                               offnum = _hash_binsearch(page, so->hashso_sk_hash);
+                                       }
+                                       else
+                                       {
+                                               /* end of bucket */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
+                                       }
+                               }
+                               break;
+
+                       case BackwardScanDirection:
+                               if (offnum != InvalidOffsetNumber)
+                                       offnum = OffsetNumberPrev(offnum);      /* move back */
+                               else
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                               }
+
+                               for (;;)
+                               {
+                                       /*
+                                        * check if we're still in the range of items with the
+                                        * target hash key
+                                        */
+                                       if (offnum >= FirstOffsetNumber)
+                                       {
+                                               Assert(offnum <= maxoff);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
+                                        */
+                                       _hash_readprev(rel, &buf, &page, &opaque);
+                                       if (BufferIsValid(buf))
+                                       {
+                                               maxoff = PageGetMaxOffsetNumber(page);
+                                               offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                                       }
+                                       else
+                                       {
+                                               /* end of bucket */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
+                                       }
+                               }
+                               break;
+
+                       default:
+                               /* NoMovementScanDirection */
+                               /* this should not be reached */
+                               itup = NULL;
+                               break;
                }
-           }
-           break;
-       case BackwardScanDirection:
-           if (offnum != InvalidOffsetNumber) {
-               offnum = OffsetNumberPrev(offnum);      /* move back */
-           } else {
-               offnum = maxoff;                        /* new page */
-           }
-           while (offnum < FirstOffsetNumber) {
-               /*
-                * either this page is empty (offnum ==
-                * InvalidOffsetNumber) or we ran off the end.
-                */
-               _hash_readprev(rel, &buf, &page, &opaque);
-               if (BufferIsInvalid(buf)) {     /* end of chain */
-                   if (allbuckets && bucket > 0) {
-                       --bucket;
-                       blkno = BUCKET_TO_BLKNO(bucket);
-                       buf = _hash_getbuf(rel, blkno, HASH_READ);
-                       page = BufferGetPage(buf);
-                       _hash_checkpage(page, LH_BUCKET_PAGE);
-                       opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-                       Assert(opaque->hasho_bucket == bucket);
-                       while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
-                           _hash_readnext(rel, &buf, &page, &opaque);
-                       }
-                       maxoff = offnum = PageGetMaxOffsetNumber(page);
-                   } else {
-                       maxoff = offnum = InvalidOffsetNumber;
-                       break;  /* while */
-                   }
-               } else {
-                   /* _hash_readprev never returns an empty page */
-                   maxoff = offnum = PageGetMaxOffsetNumber(page);
+
+               if (itup == NULL)
+               {
+                       /* we ran off the end of the bucket without finding a match */
+                       *bufP = so->hashso_curbuf = InvalidBuffer;
+                       ItemPointerSetInvalid(current);
+                       return false;
                }
-           }
-           break;
-       default:
-           /* NoMovementScanDirection */
-           /* this should not be reached */
-           break;
-       }
 
-       /* we ran off the end of the world without finding a match */
-       if (offnum == InvalidOffsetNumber) {
-           _hash_relbuf(rel, metabuf, HASH_READ);
-           *bufP = so->hashso_curbuf = InvalidBuffer;
-           ItemPointerSetInvalid(current);
-           return(false);
-       }
-       
-       /* get ready to check this tuple */
-       hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &hitem->hash_itup;
-    } while (!_hash_checkqual(scan, itup));
-   
-    /* if we made it to here, we've found a valid tuple */
-    _hash_relbuf(rel, metabuf, HASH_READ);
-    blkno = BufferGetBlockNumber(buf);
-    *bufP = so->hashso_curbuf = buf;
-    ItemPointerSet(current, blkno, offnum);
-    return(true);
+               /* check the tuple quals, loop around if not met */
+       } while (!_hash_checkqual(scan, itup));
+
+       /* if we made it to here, we've found a valid tuple */
+       blkno = BufferGetBlockNumber(buf);
+       *bufP = so->hashso_curbuf = buf;
+       ItemPointerSet(current, blkno, offnum);
+       return true;
 }