]> granicus.if.org Git - postgresql/blob - src/backend/access/hash/hashsearch.c
9e5d7e4babe09317d3377d75a471ba4fc06b8e21
[postgresql] / src / backend / access / hash / hashsearch.c
1 /*-------------------------------------------------------------------------
2  *
3  * hashsearch.c
4  *        search code for postgres hash tables
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/access/hash/hashsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "miscadmin.h"
20 #include "pgstat.h"
21 #include "utils/rel.h"
22
23
24 /*
25  *      _hash_next() -- Get the next item in a scan.
26  *
27  *              On entry, we have a valid hashso_curpos in the scan, and a
28  *              pin and read lock on the page that contains that item.
29  *              We find the next item in the scan, if any.
30  *              On success exit, we have the page containing the next item
31  *              pinned and locked.
32  */
33 bool
34 _hash_next(IndexScanDesc scan, ScanDirection dir)
35 {
36         Relation        rel = scan->indexRelation;
37         HashScanOpaque so = (HashScanOpaque) scan->opaque;
38         Buffer          buf;
39         Page            page;
40         OffsetNumber offnum;
41         ItemPointer current;
42         IndexTuple      itup;
43
44         /* we still have the buffer pinned and read-locked */
45         buf = so->hashso_curbuf;
46         Assert(BufferIsValid(buf));
47
48         /*
49          * step to next valid tuple.
50          */
51         if (!_hash_step(scan, &buf, dir))
52                 return false;
53
54         /* if we're here, _hash_step found a valid tuple */
55         current = &(so->hashso_curpos);
56         offnum = ItemPointerGetOffsetNumber(current);
57         _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
58         page = BufferGetPage(buf);
59         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
60         so->hashso_heappos = itup->t_tid;
61
62         return true;
63 }
64
65 /*
66  * Advance to next page in a bucket, if any.  If we are scanning the bucket
67  * being populated during split operation then this function advances to the
68  * bucket being split after the last bucket page of bucket being populated.
69  */
70 static void
71 _hash_readnext(IndexScanDesc scan,
72                            Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
73 {
74         BlockNumber blkno;
75         Relation        rel = scan->indexRelation;
76         HashScanOpaque so = (HashScanOpaque) scan->opaque;
77         bool            block_found = false;
78
79         blkno = (*opaquep)->hasho_nextblkno;
80
81         /*
82          * Retain the pin on primary bucket page till the end of scan.  Refer the
83          * comments in _hash_first to know the reason of retaining pin.
84          */
85         if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
86                 LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
87         else
88                 _hash_relbuf(rel, *bufp);
89
90         *bufp = InvalidBuffer;
91         /* check for interrupts while we're not holding any buffer lock */
92         CHECK_FOR_INTERRUPTS();
93         if (BlockNumberIsValid(blkno))
94         {
95                 *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
96                 block_found = true;
97         }
98         else if (so->hashso_buc_populated && !so->hashso_buc_split)
99         {
100                 /*
101                  * end of bucket, scan bucket being split if there was a split in
102                  * progress at the start of scan.
103                  */
104                 *bufp = so->hashso_split_bucket_buf;
105
106                 /*
107                  * buffer for bucket being split must be valid as we acquire the pin
108                  * on it before the start of scan and retain it till end of scan.
109                  */
110                 Assert(BufferIsValid(*bufp));
111
112                 LockBuffer(*bufp, BUFFER_LOCK_SHARE);
113
114                 /*
115                  * setting hashso_buc_split to true indicates that we are scanning
116                  * bucket being split.
117                  */
118                 so->hashso_buc_split = true;
119
120                 block_found = true;
121         }
122
123         if (block_found)
124         {
125                 *pagep = BufferGetPage(*bufp);
126                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
127         }
128 }
129
130 /*
131  * Advance to previous page in a bucket, if any.  If the current scan has
132  * started during split operation then this function advances to bucket
133  * being populated after the first bucket page of bucket being split.
134  */
135 static void
136 _hash_readprev(IndexScanDesc scan,
137                            Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
138 {
139         BlockNumber blkno;
140         Relation        rel = scan->indexRelation;
141         HashScanOpaque so = (HashScanOpaque) scan->opaque;
142         bool            haveprevblk;
143
144         blkno = (*opaquep)->hasho_prevblkno;
145
146         /*
147          * Retain the pin on primary bucket page till the end of scan.  Refer the
148          * comments in _hash_first to know the reason of retaining pin.
149          */
150         if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
151         {
152                 LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
153                 haveprevblk = false;
154         }
155         else
156         {
157                 _hash_relbuf(rel, *bufp);
158                 haveprevblk = true;
159         }
160
161         *bufp = InvalidBuffer;
162         /* check for interrupts while we're not holding any buffer lock */
163         CHECK_FOR_INTERRUPTS();
164
165         if (haveprevblk)
166         {
167                 Assert(BlockNumberIsValid(blkno));
168                 *bufp = _hash_getbuf(rel, blkno, HASH_READ,
169                                                          LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
170                 *pagep = BufferGetPage(*bufp);
171                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
172
173                 /*
174                  * We always maintain the pin on bucket page for whole scan operation,
175                  * so releasing the additional pin we have acquired here.
176                  */
177                 if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
178                         _hash_dropbuf(rel, *bufp);
179         }
180         else if (so->hashso_buc_populated && so->hashso_buc_split)
181         {
182                 /*
183                  * end of bucket, scan bucket being populated if there was a split in
184                  * progress at the start of scan.
185                  */
186                 *bufp = so->hashso_bucket_buf;
187
188                 /*
189                  * buffer for bucket being populated must be valid as we acquire the
190                  * pin on it before the start of scan and retain it till end of scan.
191                  */
192                 Assert(BufferIsValid(*bufp));
193
194                 LockBuffer(*bufp, BUFFER_LOCK_SHARE);
195                 *pagep = BufferGetPage(*bufp);
196                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
197
198                 /* move to the end of bucket chain */
199                 while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
200                         _hash_readnext(scan, bufp, pagep, opaquep);
201
202                 /*
203                  * setting hashso_buc_split to false indicates that we are scanning
204                  * bucket being populated.
205                  */
206                 so->hashso_buc_split = false;
207         }
208 }
209
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that
 *		satisfies the qualification associated with the scan descriptor. On
 *		success, the page containing the current index tuple is read locked
 *		and pinned, and the scan's opaque data entry is updated to
 *		include the buffer.
 *
 *		Returns false (with no buffer held) if the qual is NULL or no
 *		matching tuple exists.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	IndexTuple	itup;
	ItemPointer current;
	OffsetNumber offnum;

	/* count this index scan for the statistics collector */
	pgstat_count_index_scan(rel);

	/* no current position yet; _hash_step relies on this being invalid */
	current = &(so->hashso_curpos);
	ItemPointerSetInvalid(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	/* remember the key's hash so _hash_step can filter tuples by it */
	so->hashso_sk_hash = hashkey;

	/* read-lock and pin the primary page of the target bucket */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = opaque->hasho_bucket;

	/* this pin is retained until end of scan (see comment below) */
	so->hashso_bucket_buf = buf;

	/*
	 * If a bucket split is in progress, then while scanning the bucket being
	 * populated, we need to skip tuples that were copied from bucket being
	 * split.  We also need to maintain a pin on the bucket being split to
	 * ensure that split-cleanup work done by vacuum doesn't remove tuples
	 * from it till this scan is done.  We need to maintain a pin on the
	 * bucket being populated to ensure that vacuum doesn't squeeze that
	 * bucket till this scan is complete; otherwise, the ordering of tuples
	 * can't be maintained during forward and backward scans.  Here, we have
	 * to be cautious about locking order: first, acquire the lock on bucket
	 * being split; then, release the lock on it but not the pin; then,
	 * acquire a lock on bucket being populated and again re-verify whether
	 * the bucket split is still in progress.  Acquiring the lock on bucket
	 * being split first ensures that the vacuum waits for this scan to
	 * finish.
	 */
	if (H_BUCKET_BEING_POPULATED(opaque))
	{
		BlockNumber old_blkno;
		Buffer		old_buf;

		old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

		/*
		 * release the lock on new bucket and re-acquire it after acquiring
		 * the lock on old bucket.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);

		/*
		 * remember the split bucket buffer so as to use it later for
		 * scanning.
		 */
		so->hashso_split_bucket_buf = old_buf;
		/* keep the pin on old_buf, but give up the lock */
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->hasho_bucket == bucket);

		/* re-check: the split may have completed while we held no lock */
		if (H_BUCKET_BEING_POPULATED(opaque))
			so->hashso_buc_populated = true;
		else
		{
			/* split finished; the extra pin is no longer needed */
			_hash_dropbuf(rel, so->hashso_split_bucket_buf);
			so->hashso_split_bucket_buf = InvalidBuffer;
		}
	}

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		/*
		 * Backward scans that start during split needs to start from end of
		 * bucket being split.
		 */
		while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
			   (so->hashso_buc_populated && !so->hashso_buc_split))
			_hash_readnext(scan, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple and set hashso_curpos */
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	/* record the matching heap TID for the caller */
	so->hashso_heappos = itup->t_tid;

	return true;
}
368
/*
 *	_hash_step() -- step to the next valid item in a scan in the bucket.
 *
 *		If no valid record exists in the requested direction, return
 *		false.  Else, return true and set the hashso_curpos for the
 *		scan to the right thing.
 *
 *		Here we need to ensure that if the scan has started during split, then
 *		skip the tuples that are moved by split while scanning bucket being
 *		populated and then scan the bucket being split to cover all such
 *		tuples.  This is done to ensure that we don't miss tuples in the scans
 *		that are started during split.
 *
 *		'bufP' points to the current buffer, which is pinned and read-locked.
 *		On success exit, we have pin and read-lock on whichever page
 *		contains the right item; on failure, we have released all buffers.
 */
bool
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ItemPointer current;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	current = &(so->hashso_curpos);

	buf = *bufP;
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	/*
	 * If _hash_step is called from _hash_first, current will not be valid, so
	 * we can't dereference it.  However, in that case, we presumably want to
	 * start at the beginning/end of the page...
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	if (ItemPointerIsValid(current))
		offnum = ItemPointerGetOffsetNumber(current);
	else
		offnum = InvalidOffsetNumber;

	/*
	 * 'offnum' now points to the last tuple we examined (if any).
	 *
	 * continue to step through tuples until: 1) we get to the end of the
	 * bucket chain or 2) we find a valid tuple.
	 */
	do
	{
		switch (dir)
		{
			case ForwardScanDirection:
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberNext(offnum);	/* move forward */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch(page, so->hashso_sk_hash);
				}

				/* inner loop advances within the page, or onto the next page */
				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum <= maxoff)
					{
						Assert(offnum >= FirstOffsetNumber);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

						/*
						 * skip the tuples that are moved by split operation
						 * for the scan that has started when split was in
						 * progress
						 */
						if (so->hashso_buc_populated && !so->hashso_buc_split &&
							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
						{
							offnum = OffsetNumberNext(offnum);	/* move forward */
							continue;
						}

						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;		/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readnext(scan, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch(page, so->hashso_sk_hash);
					}
					else
					{
						/* end of bucket chain: itup == NULL signals failure */
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			case BackwardScanDirection:
				/* mirror image of the forward case */
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberPrev(offnum);	/* move back */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
				}

				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum >= FirstOffsetNumber)
					{
						Assert(offnum <= maxoff);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

						/*
						 * skip the tuples that are moved by split operation
						 * for the scan that has started when split was in
						 * progress
						 */
						if (so->hashso_buc_populated && !so->hashso_buc_split &&
							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
						{
							offnum = OffsetNumberPrev(offnum);	/* move back */
							continue;
						}

						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;		/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readprev(scan, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
					}
					else
					{
						/* start of bucket chain: itup == NULL signals failure */
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			default:
				/* NoMovementScanDirection */
				/* this should not be reached */
				itup = NULL;
				break;
		}

		if (itup == NULL)
		{
			/*
			 * We ran off the end of the bucket without finding a match.
			 * Release the pin on bucket buffers.  Normally, such pins are
			 * released at end of scan, however scrolling cursors can
			 * reacquire the bucket lock and pin in the same scan multiple
			 * times.
			 */
			*bufP = so->hashso_curbuf = InvalidBuffer;
			ItemPointerSetInvalid(current);
			_hash_dropscanbuf(rel, so);
			return false;
		}

		/* check the tuple quals, loop around if not met */
	} while (!_hash_checkqual(scan, itup));

	/* if we made it to here, we've found a valid tuple */
	blkno = BufferGetBlockNumber(buf);
	*bufP = so->hashso_curbuf = buf;
	ItemPointerSet(current, blkno, offnum);
	return true;
}
564 }