]> granicus.if.org Git - postgresql/blob - src/backend/access/hash/hashsearch.c
9e5d7e4babe09317d3377d75a471ba4fc06b8e21
[postgresql] / src / backend / access / hash / hashsearch.c
1 /*-------------------------------------------------------------------------
2  *
3  * hashsearch.c
4  *        search code for postgres hash tables
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/access/hash/hashsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "miscadmin.h"
20 #include "pgstat.h"
21 #include "utils/rel.h"
22
23
24 /*
25  *      _hash_next() -- Get the next item in a scan.
26  *
27  *              On entry, we have a valid hashso_curpos in the scan, and a
28  *              pin and read lock on the page that contains that item.
29  *              We find the next item in the scan, if any.
30  *              On success exit, we have the page containing the next item
31  *              pinned and locked.
32  */
33 bool
34 _hash_next(IndexScanDesc scan, ScanDirection dir)
35 {
36         Relation        rel = scan->indexRelation;
37         HashScanOpaque so = (HashScanOpaque) scan->opaque;
38         Buffer          buf;
39         Page            page;
40         OffsetNumber offnum;
41         ItemPointer current;
42         IndexTuple      itup;
43
44         /* we still have the buffer pinned and read-locked */
45         buf = so->hashso_curbuf;
46         Assert(BufferIsValid(buf));
47
48         /*
49          * step to next valid tuple.
50          */
51         if (!_hash_step(scan, &buf, dir))
52                 return false;
53
54         /* if we're here, _hash_step found a valid tuple */
55         current = &(so->hashso_curpos);
56         offnum = ItemPointerGetOffsetNumber(current);
57         _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
58         page = BufferGetPage(buf);
59         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
60         so->hashso_heappos = itup->t_tid;
61
62         return true;
63 }
64
65 /*
66  * Advance to next page in a bucket, if any.  If we are scanning the bucket
67  * being populated during split operation then this function advances to the
68  * bucket being split after the last bucket page of bucket being populated.
69  */
70 static void
71 _hash_readnext(IndexScanDesc scan,
72                            Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
73 {
74         BlockNumber blkno;
75         Relation        rel = scan->indexRelation;
76         HashScanOpaque so = (HashScanOpaque) scan->opaque;
77         bool            block_found = false;
78
79         blkno = (*opaquep)->hasho_nextblkno;
80
81         /*
82          * Retain the pin on primary bucket page till the end of scan.  Refer the
83          * comments in _hash_first to know the reason of retaining pin.
84          */
85         if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
86                 LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
87         else
88                 _hash_relbuf(rel, *bufp);
89
90         *bufp = InvalidBuffer;
91         /* check for interrupts while we're not holding any buffer lock */
92         CHECK_FOR_INTERRUPTS();
93         if (BlockNumberIsValid(blkno))
94         {
95                 *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
96                 block_found = true;
97         }
98         else if (so->hashso_buc_populated && !so->hashso_buc_split)
99         {
100                 /*
101                  * end of bucket, scan bucket being split if there was a split in
102                  * progress at the start of scan.
103                  */
104                 *bufp = so->hashso_split_bucket_buf;
105
106                 /*
107                  * buffer for bucket being split must be valid as we acquire the pin
108                  * on it before the start of scan and retain it till end of scan.
109                  */
110                 Assert(BufferIsValid(*bufp));
111
112                 LockBuffer(*bufp, BUFFER_LOCK_SHARE);
113
114                 /*
115                  * setting hashso_buc_split to true indicates that we are scanning
116                  * bucket being split.
117                  */
118                 so->hashso_buc_split = true;
119
120                 block_found = true;
121         }
122
123         if (block_found)
124         {
125                 *pagep = BufferGetPage(*bufp);
126                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
127         }
128 }
129
130 /*
131  * Advance to previous page in a bucket, if any.  If the current scan has
132  * started during split operation then this function advances to bucket
133  * being populated after the first bucket page of bucket being split.
134  */
135 static void
136 _hash_readprev(IndexScanDesc scan,
137                            Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
138 {
139         BlockNumber blkno;
140         Relation        rel = scan->indexRelation;
141         HashScanOpaque so = (HashScanOpaque) scan->opaque;
142         bool            haveprevblk;
143
144         blkno = (*opaquep)->hasho_prevblkno;
145
146         /*
147          * Retain the pin on primary bucket page till the end of scan.  Refer the
148          * comments in _hash_first to know the reason of retaining pin.
149          */
150         if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
151         {
152                 LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
153                 haveprevblk = false;
154         }
155         else
156         {
157                 _hash_relbuf(rel, *bufp);
158                 haveprevblk = true;
159         }
160
161         *bufp = InvalidBuffer;
162         /* check for interrupts while we're not holding any buffer lock */
163         CHECK_FOR_INTERRUPTS();
164
165         if (haveprevblk)
166         {
167                 Assert(BlockNumberIsValid(blkno));
168                 *bufp = _hash_getbuf(rel, blkno, HASH_READ,
169                                                          LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
170                 *pagep = BufferGetPage(*bufp);
171                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
172
173                 /*
174                  * We always maintain the pin on bucket page for whole scan operation,
175                  * so releasing the additional pin we have acquired here.
176                  */
177                 if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
178                         _hash_dropbuf(rel, *bufp);
179         }
180         else if (so->hashso_buc_populated && so->hashso_buc_split)
181         {
182                 /*
183                  * end of bucket, scan bucket being populated if there was a split in
184                  * progress at the start of scan.
185                  */
186                 *bufp = so->hashso_bucket_buf;
187
188                 /*
189                  * buffer for bucket being populated must be valid as we acquire the
190                  * pin on it before the start of scan and retain it till end of scan.
191                  */
192                 Assert(BufferIsValid(*bufp));
193
194                 LockBuffer(*bufp, BUFFER_LOCK_SHARE);
195                 *pagep = BufferGetPage(*bufp);
196                 *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
197
198                 /* move to the end of bucket chain */
199                 while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
200                         _hash_readnext(scan, bufp, pagep, opaquep);
201
202                 /*
203                  * setting hashso_buc_split to false indicates that we are scanning
204                  * bucket being populated.
205                  */
206                 so->hashso_buc_split = false;
207         }
208 }
209
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that
 *		satisfies the qualification associated with the scan descriptor. On
 *		success, the page containing the current index tuple is read locked
 *		and pinned, and the scan's opaque data entry is updated to
 *		include the buffer.
 *
 *		Returns false (with no buffer held) if the qual is NULL or no
 *		matching tuple exists.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	IndexTuple	itup;
	ItemPointer current;
	OffsetNumber offnum;

	/* count this index scan for the statistics collector */
	pgstat_count_index_scan(rel);

	/* no current position yet; _hash_step relies on this being invalid */
	current = &(so->hashso_curpos);
	ItemPointerSetInvalid(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	/* remember the key's hash so _hash_step can filter tuples by it */
	so->hashso_sk_hash = hashkey;

	/* read-lock and pin the primary page of the target bucket */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = opaque->hasho_bucket;

	/* this pin is retained until end of scan (see comment below) */
	so->hashso_bucket_buf = buf;

	/*
	 * If a bucket split is in progress, then while scanning the bucket being
	 * populated, we need to skip tuples that were copied from bucket being
	 * split.  We also need to maintain a pin on the bucket being split to
	 * ensure that split-cleanup work done by vacuum doesn't remove tuples
	 * from it till this scan is done.  We need to maintain a pin on the
	 * bucket being populated to ensure that vacuum doesn't squeeze that
	 * bucket till this scan is complete; otherwise, the ordering of tuples
	 * can't be maintained during forward and backward scans.  Here, we have
	 * to be cautious about locking order: first, acquire the lock on bucket
	 * being split; then, release the lock on it but not the pin; then,
	 * acquire a lock on bucket being populated and again re-verify whether
	 * the bucket split is still in progress.  Acquiring the lock on bucket
	 * being split first ensures that the vacuum waits for this scan to
	 * finish.
	 */
	if (H_BUCKET_BEING_POPULATED(opaque))
	{
		BlockNumber old_blkno;
		Buffer		old_buf;

		old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

		/*
		 * release the lock on new bucket and re-acquire it after acquiring
		 * the lock on old bucket.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);

		/*
		 * remember the split bucket buffer so as to use it later for
		 * scanning.
		 */
		so->hashso_split_bucket_buf = old_buf;
		/* keep the pin on old_buf, but give up the lock */
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->hasho_bucket == bucket);

		/* re-check: the split may have completed while we held no lock */
		if (H_BUCKET_BEING_POPULATED(opaque))
			so->hashso_buc_populated = true;
		else
		{
			/* split finished; the extra pin is no longer needed */
			_hash_dropbuf(rel, so->hashso_split_bucket_buf);
			so->hashso_split_bucket_buf = InvalidBuffer;
		}
	}

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		/*
		 * Backward scans that start during split needs to start from end of
		 * bucket being split.
		 */
		while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
			   (so->hashso_buc_populated && !so->hashso_buc_split))
			_hash_readnext(scan, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple and set hashso_curpos */
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	/* record the matching heap TID for the caller */
	so->hashso_heappos = itup->t_tid;

	return true;
}
368
/*
 *	_hash_step() -- step to the next valid item in a scan in the bucket.
 *
 *		If no valid record exists in the requested direction, return
 *		false.  Else, return true and set the hashso_curpos for the
 *		scan to the right thing.
 *
 *		Here we need to ensure that if the scan has started during split, then
 *		skip the tuples that are moved by split while scanning bucket being
 *		populated and then scan the bucket being split to cover all such
 *		tuples.  This is done to ensure that we don't miss tuples in the scans
 *		that are started during split.
 *
 *		'bufP' points to the current buffer, which is pinned and read-locked.
 *		On success exit, we have pin and read-lock on whichever page
 *		contains the right item; on failure, we have released all buffers.
 */
bool
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ItemPointer current;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	current = &(so->hashso_curpos);

	buf = *bufP;
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	/*
	 * If _hash_step is called from _hash_first, current will not be valid, so
	 * we can't dereference it.  However, in that case, we presumably want to
	 * start at the beginning/end of the page...
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	if (ItemPointerIsValid(current))
		offnum = ItemPointerGetOffsetNumber(current);
	else
		offnum = InvalidOffsetNumber;

	/*
	 * 'offnum' now points to the last tuple we examined (if any).
	 *
	 * continue to step through tuples until: 1) we get to the end of the
	 * bucket chain or 2) we find a valid tuple.
	 */
	do
	{
		switch (dir)
		{
			case ForwardScanDirection:
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberNext(offnum);	/* move forward */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch(page, so->hashso_sk_hash);
				}

				/* inner loop advances within the page, or onto the next page */
				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum <= maxoff)
					{
						Assert(offnum >= FirstOffsetNumber);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

						/*
						 * skip the tuples that are moved by split operation
						 * for the scan that has started when split was in
						 * progress
						 */
						if (so->hashso_buc_populated && !so->hashso_buc_split &&
							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
						{
							offnum = OffsetNumberNext(offnum);	/* move forward */
							continue;
						}

						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;		/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readnext(scan, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch(page, so->hashso_sk_hash);
					}
					else
					{
						/* end of bucket chain: itup == NULL signals failure */
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			case BackwardScanDirection:
				/* mirror image of the forward case */
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberPrev(offnum);	/* move back */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
				}

				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum >= FirstOffsetNumber)
					{
						Assert(offnum <= maxoff);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

						/*
						 * skip the tuples that are moved by split operation
						 * for the scan that has started when split was in
						 * progress
						 */
						if (so->hashso_buc_populated && !so->hashso_buc_split &&
							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
						{
							offnum = OffsetNumberPrev(offnum);	/* move back */
							continue;
						}

						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;		/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readprev(scan, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
					}
					else
					{
						/* start of bucket chain: itup == NULL signals failure */
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			default:
				/* NoMovementScanDirection */
				/* this should not be reached */
				itup = NULL;
				break;
		}

		if (itup == NULL)
		{
			/*
			 * We ran off the end of the bucket without finding a match.
			 * Release the pin on bucket buffers.  Normally, such pins are
			 * released at end of scan, however scrolling cursors can
			 * reacquire the bucket lock and pin in the same scan multiple
			 * times.
			 */
			*bufP = so->hashso_curbuf = InvalidBuffer;
			ItemPointerSetInvalid(current);
			_hash_dropscanbuf(rel, so);
			return false;
		}

		/* check the tuple quals, loop around if not met */
	} while (!_hash_checkqual(scan, itup));

	/* if we made it to here, we've found a valid tuple */
	blkno = BufferGetBlockNumber(buf);
	*bufP = so->hashso_curbuf = buf;
	ItemPointerSet(current, blkno, offnum);
	return true;
}
564 }