Modify BufferGetPage() to prepare for "snapshot too old" feature

[postgresql] / src / backend / access / hash / hashsearch.c
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c

index 76ad5d318497152d60a52d682d6338ff772625c9..dd1f464e53addfa00e02a00615f2cfa3c07bc898 100644 (file)
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -3,25 +3,28 @@
   * hashsearch.c
   *       search code for postgres hash tables
   *
- * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.36 2004/08/29 04:12:18 momjian Exp $
+ *       src/backend/access/hash/hashsearch.c
   *
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
  
  #include "access/hash.h"
-#include "storage/lmgr.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/rel.h"
  
  
  /*
   *     _hash_next() -- Get the next item in a scan.
   *
- *             On entry, we have a valid currentItemData in the scan, and a
+ *             On entry, we have a valid hashso_curpos in the scan, and a
   *             pin and read lock on the page that contains that item.
   *             We find the next item in the scan, if any.
   *             On success exit, we have the page containing the next item
@@ -36,7 +39,6 @@ _hash_next(IndexScanDesc scan, ScanDirection dir)
         Page            page;
         OffsetNumber offnum;
         ItemPointer current;
-       HashItem        hitem;
         IndexTuple      itup;
  
         /* we still have the buffer pinned and read-locked */
@@ -50,13 +52,12 @@ _hash_next(IndexScanDesc scan, ScanDirection dir)
                 return false;
  
         /* if we're here, _hash_step found a valid tuple */
-       current = &(scan->currentItemData);
+       current = &(so->hashso_curpos);
         offnum = ItemPointerGetOffsetNumber(current);
-       page = BufferGetPage(buf);
-       _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-       hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &hitem->hash_itup;
-       scan->xs_ctup.t_self = itup->t_tid;
+       _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+       so->hashso_heappos = itup->t_tid;
  
         return true;
  }
@@ -73,11 +74,12 @@ _hash_readnext(Relation rel,
         blkno = (*opaquep)->hasho_nextblkno;
         _hash_relbuf(rel, *bufp);
         *bufp = InvalidBuffer;
+       /* check for interrupts while we're not holding any buffer lock */
+       CHECK_FOR_INTERRUPTS();
         if (BlockNumberIsValid(blkno))
         {
-               *bufp = _hash_getbuf(rel, blkno, HASH_READ);
-               *pagep = BufferGetPage(*bufp);
-               _hash_checkpage(rel, *pagep, LH_OVERFLOW_PAGE);
+               *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+               *pagep = BufferGetPage(*bufp, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
         }
  }
@@ -94,11 +96,13 @@ _hash_readprev(Relation rel,
         blkno = (*opaquep)->hasho_prevblkno;
         _hash_relbuf(rel, *bufp);
         *bufp = InvalidBuffer;
+       /* check for interrupts while we're not holding any buffer lock */
+       CHECK_FOR_INTERRUPTS();
         if (BlockNumberIsValid(blkno))
         {
-               *bufp = _hash_getbuf(rel, blkno, HASH_READ);
-               *pagep = BufferGetPage(*bufp);
-               _hash_checkpage(rel, *pagep, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+               *bufp = _hash_getbuf(rel, blkno, HASH_READ,
+                                                        LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+               *pagep = BufferGetPage(*bufp, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
         }
  }
@@ -117,76 +121,119 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
  {
         Relation        rel = scan->indexRelation;
         HashScanOpaque so = (HashScanOpaque) scan->opaque;
+       ScanKey         cur;
         uint32          hashkey;
         Bucket          bucket;
         BlockNumber blkno;
+       BlockNumber oldblkno = InvalidBuffer;
+       bool            retry = false;
         Buffer          buf;
         Buffer          metabuf;
         Page            page;
         HashPageOpaque opaque;
         HashMetaPage metap;
-       HashItem        hitem;
         IndexTuple      itup;
         ItemPointer current;
         OffsetNumber offnum;
  
-       current = &(scan->currentItemData);
+       pgstat_count_index_scan(rel);
+
+       current = &(so->hashso_curpos);
         ItemPointerSetInvalid(current);
  
         /*
-        * We do not support hash scans with no index qualification, because
-        * we would have to read the whole index rather than just one bucket.
-        * That creates a whole raft of problems, since we haven't got a
-        * practical way to lock all the buckets against splits or compactions.
+        * We do not support hash scans with no index qualification, because we
+        * would have to read the whole index rather than just one bucket. That
+        * creates a whole raft of problems, since we haven't got a practical way
+        * to lock all the buckets against splits or compactions.
          */
         if (scan->numberOfKeys < 1)
                 ereport(ERROR,
                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                  errmsg("hash indexes do not support whole-index scans")));
  
+       /* There may be more than one index qual, but we hash only the first */
+       cur = &scan->keyData[0];
+
+       /* We support only single-column hash indexes */
+       Assert(cur->sk_attno == 1);
+       /* And there's only one operator strategy, too */
+       Assert(cur->sk_strategy == HTEqualStrategyNumber);
+
         /*
-        * If the constant in the index qual is NULL, assume it cannot match
-        * any items in the index.
+        * If the constant in the index qual is NULL, assume it cannot match any
+        * items in the index.
          */
-       if (scan->keyData[0].sk_flags & SK_ISNULL)
+       if (cur->sk_flags & SK_ISNULL)
                 return false;
  
         /*
-        * Okay to compute the hash key.  We want to do this before acquiring
-        * any locks, in case a user-defined hash function happens to be slow.
+        * Okay to compute the hash key.  We want to do this before acquiring any
+        * locks, in case a user-defined hash function happens to be slow.
+        *
+        * If scankey operator is not a cross-type comparison, we can use the
+        * cached hash function; otherwise gotta look it up in the catalogs.
+        *
+        * We support the convention that sk_subtype == InvalidOid means the
+        * opclass input type; this is a hack to simplify life for ScanKeyInit().
          */
-       hashkey = _hash_datum2hashkey(rel, scan->keyData[0].sk_argument);
+       if (cur->sk_subtype == rel->rd_opcintype[0] ||
+               cur->sk_subtype == InvalidOid)
+               hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
+       else
+               hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
+                                                                                  cur->sk_subtype);
  
-       /*
-        * Acquire shared split lock so we can compute the target bucket
-        * safely (see README).
-        */
-       _hash_getlock(rel, 0, HASH_SHARE);
+       so->hashso_sk_hash = hashkey;
  
         /* Read the metapage */
-       metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
-       _hash_checkpage(rel, (Page) metap, LH_META_PAGE);
+       metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+       page = BufferGetPage(metabuf, NULL, NULL,
+                                                BGP_NO_SNAPSHOT_TEST);
+       metap = HashPageGetMeta(page);
  
         /*
-        * Compute the target bucket number, and convert to block number.
+        * Loop until we get a lock on the correct target bucket.
          */
-       bucket = _hash_hashkey2bucket(hashkey,
-                                                                 metap->hashm_maxbucket,
-                                                                 metap->hashm_highmask,
-                                                                 metap->hashm_lowmask);
-
-       blkno = BUCKET_TO_BLKNO(metap, bucket);
+       for (;;)
+       {
+               /*
+                * Compute the target bucket number, and convert to block number.
+                */
+               bucket = _hash_hashkey2bucket(hashkey,
+                                                                         metap->hashm_maxbucket,
+                                                                         metap->hashm_highmask,
+                                                                         metap->hashm_lowmask);
+
+               blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+               /* Release metapage lock, but keep pin. */
+               _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+
+               /*
+                * If the previous iteration of this loop locked what is still the
+                * correct target bucket, we are done.  Otherwise, drop any old lock
+                * and lock what now appears to be the correct bucket.
+                */
+               if (retry)
+               {
+                       if (oldblkno == blkno)
+                               break;
+                       _hash_droplock(rel, oldblkno, HASH_SHARE);
+               }
+               _hash_getlock(rel, blkno, HASH_SHARE);
+
+               /*
+                * Reacquire metapage lock and check that no bucket split has taken
+                * place while we were awaiting the bucket lock.
+                */
+               _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
+               oldblkno = blkno;
+               retry = true;
+       }
  
         /* done with the metapage */
-       _hash_relbuf(rel, metabuf);
-
-       /*
-        * Acquire share lock on target bucket; then we can release split lock.
-        */
-       _hash_getlock(rel, blkno, HASH_SHARE);
-
-       _hash_droplock(rel, 0, HASH_SHARE);
+       _hash_dropbuf(rel, metabuf);
  
         /* Update scan opaque state to show we have lock on the bucket */
         so->hashso_bucket = bucket;
@@ -194,9 +241,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
         so->hashso_bucket_blkno = blkno;
  
         /* Fetch the primary bucket page for the bucket */
-       buf = _hash_getbuf(rel, blkno, HASH_READ);
-       page = BufferGetPage(buf);
-       _hash_checkpage(rel, page, LH_BUCKET_PAGE);
+       buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+       page = BufferGetPage(buf, NULL, NULL,
+                                                BGP_NO_SNAPSHOT_TEST);
         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
         Assert(opaque->hasho_bucket == bucket);
  
@@ -213,11 +260,10 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
  
         /* if we're here, _hash_step found a valid tuple */
         offnum = ItemPointerGetOffsetNumber(current);
-       page = BufferGetPage(buf);
-       _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-       hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &hitem->hash_itup;
-       scan->xs_ctup.t_self = itup->t_tid;
+       _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+       so->hashso_heappos = itup->t_tid;
  
         return true;
  }
@@ -226,7 +272,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
   *     _hash_step() -- step to the next valid item in a scan in the bucket.
   *
   *             If no valid record exists in the requested direction, return
- *             false.  Else, return true and set the CurrentItemData for the
+ *             false.  Else, return true and set the hashso_curpos for the
   *             scan to the right thing.
   *
   *             'bufP' points to the current buffer, which is pinned and read-locked.
@@ -244,23 +290,20 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
         HashPageOpaque opaque;
         OffsetNumber maxoff;
         OffsetNumber offnum;
-       Bucket          bucket;
         BlockNumber blkno;
-       HashItem        hitem;
         IndexTuple      itup;
  
-       current = &(scan->currentItemData);
+       current = &(so->hashso_curpos);
  
         buf = *bufP;
-       page = BufferGetPage(buf);
-       _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+       page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-       bucket = opaque->hasho_bucket;
  
         /*
-        * If _hash_step is called from _hash_first, current will not be
-        * valid, so we can't dereference it.  However, in that case, we
-        * presumably want to start at the beginning/end of the page...
+        * If _hash_step is called from _hash_first, current will not be valid, so
+        * we can't dereference it.  However, in that case, we presumably want to
+        * start at the beginning/end of the page...
          */
         maxoff = PageGetMaxOffsetNumber(page);
         if (ItemPointerIsValid(current))
@@ -269,7 +312,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                 offnum = InvalidOffsetNumber;
  
         /*
-        * 'offnum' now points to the last tuple we have seen (if any).
+        * 'offnum' now points to the last tuple we examined (if any).
          *
          * continue to step through tuples until: 1) we get to the end of the
          * bucket chain or 2) we find a valid tuple.
@@ -282,26 +325,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                 if (offnum != InvalidOffsetNumber)
                                         offnum = OffsetNumberNext(offnum);      /* move forward */
                                 else
-                                       offnum = FirstOffsetNumber; /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch(page, so->hashso_sk_hash);
+                               }
  
-                               while (offnum > maxoff)
+                               for (;;)
                                 {
                                         /*
-                                        * either this page is empty
-                                        * (maxoff == InvalidOffsetNumber)
-                                        * or we ran off the end.
+                                        * check if we're still in the range of items with the
+                                        * target hash key
+                                        */
+                                       if (offnum <= maxoff)
+                                       {
+                                               Assert(offnum >= FirstOffsetNumber);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
                                          */
                                         _hash_readnext(rel, &buf, &page, &opaque);
                                         if (BufferIsValid(buf))
                                         {
                                                 maxoff = PageGetMaxOffsetNumber(page);
-                                               offnum = FirstOffsetNumber;
+                                               offnum = _hash_binsearch(page, so->hashso_sk_hash);
                                         }
                                         else
                                         {
                                                 /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                         }
                                 }
                                 break;
@@ -310,25 +366,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                 if (offnum != InvalidOffsetNumber)
                                         offnum = OffsetNumberPrev(offnum);      /* move back */
                                 else
-                                       offnum = maxoff;        /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                               }
  
-                               while (offnum < FirstOffsetNumber)
+                               for (;;)
                                 {
                                         /*
-                                        * either this page is empty
-                                        * (offnum == InvalidOffsetNumber)
-                                        * or we ran off the end.
+                                        * check if we're still in the range of items with the
+                                        * target hash key
+                                        */
+                                       if (offnum >= FirstOffsetNumber)
+                                       {
+                                               Assert(offnum <= maxoff);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
                                          */
                                         _hash_readprev(rel, &buf, &page, &opaque);
                                         if (BufferIsValid(buf))
                                         {
-                                               maxoff = offnum = PageGetMaxOffsetNumber(page);
+                                               maxoff = PageGetMaxOffsetNumber(page);
+                                               offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
                                         }
                                         else
                                         {
                                                 /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                         }
                                 }
                                 break;
@@ -336,20 +406,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                         default:
                                 /* NoMovementScanDirection */
                                 /* this should not be reached */
+                               itup = NULL;
                                 break;
                 }
  
-               /* we ran off the end of the world without finding a match */
-               if (offnum == InvalidOffsetNumber)
+               if (itup == NULL)
                 {
+                       /* we ran off the end of the bucket without finding a match */
                         *bufP = so->hashso_curbuf = InvalidBuffer;
                         ItemPointerSetInvalid(current);
                         return false;
                 }
  
-               /* get ready to check this tuple */
-               hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
-               itup = &hitem->hash_itup;
+               /* check the tuple quals, loop around if not met */
         } while (!_hash_checkqual(scan, itup));
  
         /* if we made it to here, we've found a valid tuple */