/*-------------------------------------------------------------------------
 *
 * hashsearch.c
 *	  search code for postgres hash tables
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashsearch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "utils/rel.h"

/*
 *	_hash_next() -- Get the next item in a scan.
 *
 *		On entry, we have a valid hashso_curpos in the scan, and a
 *		pin and read lock on the page that contains that item.
 *		We find the next item in the scan, if any.
 *		On success exit, we have the page containing the next item
 *		pinned and read-locked.
 */
bool
_hash_next(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	OffsetNumber offnum;
	ItemPointer current;
	IndexTuple	itup;

	/* we still have the buffer pinned and read-locked */
	buf = so->hashso_curbuf;
	Assert(BufferIsValid(buf));

	/*
	 * step to next valid tuple.
	 */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple */
	current = &(so->hashso_curpos);
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	so->hashso_heappos = itup->t_tid;

	return true;
}
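
#ifdef NOT_USED
/*
 * Illustrative sketch only, not part of the upstream file: roughly how an
 * AM-level caller such as hashgettuple() is expected to drive _hash_first()
 * and _hash_next().  The function name and the bare-bones loop are invented
 * for illustration; the real caller also deals with kill_prior_tuple, scan
 * direction changes, and handing each TID back to the executor.
 */
static bool
example_scan_driver(IndexScanDesc scan)
{
	/* position the scan on the first tuple matching the scan keys */
	if (!_hash_first(scan, ForwardScanDirection))
		return false;			/* no matching tuple in the index */

	/* each successful step leaves the matching heap TID in hashso_heappos */
	while (_hash_next(scan, ForwardScanDirection))
	{
		/* a real caller would return hashso_heappos to the executor here */
	}

	return true;
}
#endif							/* NOT_USED */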

/*
 * Advance to next page in a bucket, if any.  If we are scanning the bucket
 * being populated during a split operation, then this function advances to
 * the bucket being split after the last bucket page of the bucket being
 * populated.
 */
static void
_hash_readnext(IndexScanDesc scan,
			   Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
	BlockNumber blkno;
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		block_found = false;

	blkno = (*opaquep)->hasho_nextblkno;

	/*
	 * Retain the pin on the primary bucket page till the end of the scan.
	 * See the comments in _hash_first for why the pin is retained.
	 */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, *bufp);

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();
	if (BlockNumberIsValid(blkno))
	{
		*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
		block_found = true;
	}
	else if (so->hashso_buc_populated && !so->hashso_buc_split)
	{
		/*
		 * End of bucket: scan the bucket being split, if there was a split
		 * in progress at the start of the scan.
		 */
		*bufp = so->hashso_split_bucket_buf;

		/*
		 * The buffer for the bucket being split must be valid, as we
		 * acquire a pin on it before the start of the scan and retain it
		 * till the end of the scan.
		 */
		Assert(BufferIsValid(*bufp));

		LockBuffer(*bufp, BUFFER_LOCK_SHARE);

		/*
		 * Setting hashso_buc_split to true indicates that we are scanning
		 * the bucket being split.
		 */
		so->hashso_buc_split = true;

		block_found = true;
	}

	if (block_found)
	{
		*pagep = BufferGetPage(*bufp);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
	}
}
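
/*
 * Comment added for clarity (not in the upstream file): for a scan that
 * started while a split was in progress, the forward page-visit order
 * produced by _hash_readnext() is
 *
 *		new-bucket primary page -> new-bucket overflow pages ->
 *		old-bucket primary page -> old-bucket overflow pages
 *
 * where "new" is the bucket being populated and "old" is the bucket being
 * split.
 */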

/*
 * Advance to previous page in a bucket, if any.  If the current scan has
 * started during a split operation, then this function advances to the
 * bucket being populated after the first bucket page of the bucket being
 * split.
 */
static void
_hash_readprev(IndexScanDesc scan,
			   Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
	BlockNumber blkno;
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		haveprevblk;

	blkno = (*opaquep)->hasho_prevblkno;

	/*
	 * Retain the pin on the primary bucket page till the end of the scan.
	 * See the comments in _hash_first for why the pin is retained.  Note
	 * that if the current page is a primary bucket page, there is no
	 * previous page to read for this bucket.
	 */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
	{
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
		haveprevblk = false;
	}
	else
	{
		_hash_relbuf(rel, *bufp);
		haveprevblk = true;
	}

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();

	if (haveprevblk)
	{
		Assert(BlockNumberIsValid(blkno));
		*bufp = _hash_getbuf(rel, blkno, HASH_READ,
							 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
		*pagep = BufferGetPage(*bufp);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

		/*
		 * We always maintain the pin on the bucket page for the whole scan
		 * operation, so release the additional pin we have acquired here.
		 */
		if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
			_hash_dropbuf(rel, *bufp);
	}
	else if (so->hashso_buc_populated && so->hashso_buc_split)
	{
		/*
		 * End of bucket: scan the bucket being populated, if there was a
		 * split in progress at the start of the scan.
		 */
		*bufp = so->hashso_bucket_buf;

		/*
		 * The buffer for the bucket being populated must be valid, as we
		 * acquire a pin on it before the start of the scan and retain it
		 * till the end of the scan.
		 */
		Assert(BufferIsValid(*bufp));

		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		*pagep = BufferGetPage(*bufp);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

		/* move to the end of bucket chain */
		while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
			_hash_readnext(scan, bufp, pagep, opaquep);

		/*
		 * Setting hashso_buc_split to false indicates that we are scanning
		 * the bucket being populated.
		 */
		so->hashso_buc_split = false;
	}
}
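
/*
 * Comment added for clarity (not in the upstream file): for a scan that
 * started while a split was in progress, _hash_readprev() yields exactly
 * the reverse of _hash_readnext()'s order, i.e. old-bucket overflow pages
 * (last to first), then the old-bucket primary page, then new-bucket
 * overflow pages (last to first), then the new-bucket primary page.
 */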

/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that satisfies the qualification
 *		associated with the scan descriptor.  On success, the page containing
 *		the current index tuple is read locked and pinned, and the scan's
 *		opaque data entry is updated to include the buffer.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	IndexTuple	itup;
	ItemPointer current;
	OffsetNumber offnum;

	pgstat_count_index_scan(rel);

	current = &(so->hashso_curpos);
	ItemPointerSetInvalid(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If the scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise we have to look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	so->hashso_sk_hash = hashkey;
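
	/*
	 * Comment added for clarity (not in the upstream file): a cross-type
	 * probe, e.g. an int8 index column compared against an int4 constant,
	 * reaches the _hash_datum2hashkey_type() branch with sk_subtype set to
	 * the constant's type.  Hash opfamilies require that equal values of
	 * different member types hash identically, which is what makes such a
	 * probe correct.
	 */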

	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = opaque->hasho_bucket;

	so->hashso_bucket_buf = buf;

	/*
	 * If a bucket split is in progress, then while scanning the bucket being
	 * populated, we need to skip tuples that were copied from the bucket
	 * being split.  We also need to maintain a pin on the bucket being split
	 * to ensure that the split-cleanup work done by vacuum doesn't remove
	 * tuples from it till this scan is done.  We need to maintain a pin on
	 * the bucket being populated to ensure that vacuum doesn't squeeze that
	 * bucket till this scan is complete; otherwise, the ordering of tuples
	 * can't be maintained during forward and backward scans.  Here, we have
	 * to be cautious about locking order: first, acquire the lock on the
	 * bucket being split; then, release the lock on it but not the pin;
	 * then, acquire a lock on the bucket being populated and again re-verify
	 * whether the bucket split is still in progress.  Acquiring the lock on
	 * the bucket being split first ensures that vacuum waits for this scan
	 * to finish.
	 */
	if (H_BUCKET_BEING_POPULATED(opaque))
	{
		BlockNumber old_blkno;
		Buffer		old_buf;

		old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

		/*
		 * release the lock on the new bucket and re-acquire it after
		 * acquiring the lock on the old bucket.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);

		/*
		 * remember the split bucket buffer so as to use it later for
		 * scanning.
		 */
		so->hashso_split_bucket_buf = old_buf;
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->hasho_bucket == bucket);

		if (H_BUCKET_BEING_POPULATED(opaque))
			so->hashso_buc_populated = true;
		else
		{
			_hash_dropbuf(rel, so->hashso_split_bucket_buf);
			so->hashso_split_bucket_buf = InvalidBuffer;
		}
	}
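
	/*
	 * Comment added for clarity (not in the upstream file): from here on,
	 * hashso_buc_populated == true means the scan must skip tuples marked
	 * INDEX_MOVED_BY_SPLIT_MASK while reading this bucket, and
	 * hashso_split_bucket_buf holds a pin on the old bucket only if the
	 * split was still in progress when we re-checked the flag.
	 */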

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		/*
		 * A backward scan that starts during a split needs to start from
		 * the end of the bucket being split.
		 */
		while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
			   (so->hashso_buc_populated && !so->hashso_buc_split))
			_hash_readnext(scan, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple */
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	so->hashso_heappos = itup->t_tid;

	return true;
}

/*
 *	_hash_step() -- step to the next valid item in a scan in the bucket.
 *
 *		If no valid record exists in the requested direction, return
 *		false.  Else, return true and set the hashso_curpos for the
 *		scan to the right thing.
 *
 *		Here we need to ensure that if the scan has started during a split,
 *		we skip the tuples that were moved by the split while scanning the
 *		bucket being populated, and then scan the bucket being split to
 *		cover all such tuples.  This ensures that we don't miss tuples in
 *		scans that started during a split.
 *
 *		'bufP' points to the current buffer, which is pinned and read-locked.
 *		On success exit, we have pin and read-lock on whichever page
 *		contains the right item; on failure, we have released all buffers.
 */
bool
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ItemPointer current;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	current = &(so->hashso_curpos);

	buf = *bufP;
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	/*
	 * If _hash_step is called from _hash_first, current will not be valid,
	 * so we can't dereference it.  However, in that case, we presumably want
	 * to start at the beginning/end of the page...
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	if (ItemPointerIsValid(current))
		offnum = ItemPointerGetOffsetNumber(current);
	else
		offnum = InvalidOffsetNumber;

	/*
	 * 'offnum' now points to the last tuple we examined (if any).
	 *
	 * continue to step through tuples until: 1) we get to the end of the
	 * bucket chain or 2) we find a valid tuple.
	 */
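	/*
	 * Comment added for clarity (not in the upstream file): tuples within
	 * each hash index page are kept in hash key order, which is what lets
	 * _hash_binsearch()/_hash_binsearch_last() locate the first/last
	 * possible match on a freshly loaded page, and lets the loops below
	 * leave a page as soon as the tuple at 'offnum' no longer carries the
	 * target hash key.
	 */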
	do
	{
		switch (dir)
		{
			case ForwardScanDirection:
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberNext(offnum);	/* move forward */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch(page, so->hashso_sk_hash);
				}

				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum <= maxoff)
					{
						Assert(offnum >= FirstOffsetNumber);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

						/*
						 * skip the tuples that were moved by the split
						 * operation, for a scan that started when the split
						 * was in progress
						 */
						if (so->hashso_buc_populated && !so->hashso_buc_split &&
							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
						{
							offnum = OffsetNumberNext(offnum);	/* move forward */
							continue;
						}

						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;	/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readnext(scan, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch(page, so->hashso_sk_hash);
					}
					else
					{
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			case BackwardScanDirection:
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberPrev(offnum);	/* move back */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
				}

				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum >= FirstOffsetNumber)
					{
						Assert(offnum <= maxoff);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

						/*
						 * skip the tuples that were moved by the split
						 * operation, for a scan that started when the split
						 * was in progress
						 */
						if (so->hashso_buc_populated && !so->hashso_buc_split &&
							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
						{
							offnum = OffsetNumberPrev(offnum);	/* move back */
							continue;
						}

						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;	/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the previous
					 */
					_hash_readprev(scan, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
					}
					else
					{
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			default:
				/* NoMovementScanDirection */
				/* this should not be reached */
				itup = NULL;
				break;
		}

		if (itup == NULL)
		{
			/*
			 * We ran off the end of the bucket without finding a match.
			 * Release the pin on bucket buffers.  Normally, such pins are
			 * released at the end of the scan; however, scrolling cursors
			 * can reacquire the bucket lock and pin in the same scan
			 * multiple times.
			 */
			*bufP = so->hashso_curbuf = InvalidBuffer;
			ItemPointerSetInvalid(current);
			_hash_dropscanbuf(rel, so);
			return false;
		}

		/* check the tuple quals, loop around if not met */
	} while (!_hash_checkqual(scan, itup));

	/* if we made it to here, we've found a valid tuple */
	blkno = BufferGetBlockNumber(buf);
	*bufP = so->hashso_curbuf = buf;
	ItemPointerSet(current, blkno, offnum);

	return true;
}