/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *	  Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashovfl.c
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "utils/rel.h"

static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);

/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;

	/* Convert zero-based bitnumber to 1-based page number */
	ovflbitnum += 1;

	/* Determine the split number for this page (must be >= 1) */
	for (i = 1;
		 i < splitnum && ovflbitnum > metap->hashm_spares[i];
		 i++)
		 /* loop */ ;

	/*
	 * Convert to absolute page number by adding the number of bucket pages
	 * that exist before this split point.
	 */
	return (BlockNumber) ((1 << i) + ovflbitnum);
}
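
/*
 * For illustration, suppose (hypothetical values) hashm_ovflpoint = 3 and
 * hashm_spares[] = {0, 0, 1, 3, ...}.  Overflow bit number 1 becomes
 * 1-based page number 2; the loop advances past splits 1 and 2 (because
 * 2 > hashm_spares[1] = 0 and 2 > hashm_spares[2] = 1) and stops at split
 * number 3, yielding block number (1 << 3) + 2 = 10.
 */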

/*
 * Convert overflow page block number to bit number for free-page bitmap.
 */
static uint32
blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;
	uint32		bitnum;

	/* Determine the split number containing this page */
	for (i = 1; i <= splitnum; i++)
	{
		if (ovflblkno <= (BlockNumber) (1 << i))
			break;				/* cannot be in this split */
		bitnum = ovflblkno - (1 << i);
		if (bitnum <= metap->hashm_spares[i])
			return bitnum - 1;	/* -1 to convert 1-based to 0-based */
	}

	elog(ERROR, "invalid overflow block number %u", ovflblkno);
	return 0;					/* keep compiler quiet */
}
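
/*
 * This is the inverse of bitno_to_blkno: continuing the example above,
 * blkno_to_bitno(metap, 10) finds split number 3 (the first split with
 * 10 - (1 << 3) <= hashm_spares[3]) and returns 10 - (1 << 3) - 1 = 1.
 */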

/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore).  The returned overflow page will be pinned and write-locked;
 *	it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	The caller must hold at least share lock on the bucket, to ensure that
 *	no one else tries to compact the bucket meanwhile.  This guarantees that
 *	'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;

	/* allocate and lock an empty overflow page */
	ovflbuf = _hash_getovflpage(rel, metabuf);

	/*
	 * Write-lock the tail page.  It is okay to hold two buffer locks here
	 * since there cannot be anyone else contending for access to ovflbuf.
	 */
	_hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, buf);

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/* now that we have correct backlink, initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
	_hash_wrtbuf(rel, buf);

	return ovflbuf;
}
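
/*
 * A sketch of the typical calling pattern (not a verbatim caller): an
 * inserter that finds no room anywhere in the bucket chain does roughly
 *
 *		_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
 *		buf = _hash_addovflpage(rel, metabuf, buf);
 *
 * and can then place its tuple on the pinned, write-locked, empty
 * overflow page that is returned.
 */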

/*
 *	_hash_getovflpage()
 *
 *	Find an available overflow page and return it.  The returned buffer
 *	is pinned and write-locked, and has had _hash_pageinit() applied,
 *	but it is caller's responsibility to fill the special space.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is left in the same state at exit.
 */
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
	HashMetaPage metap;
	Buffer		mapbuf = 0;
	Buffer		newbuf;
	BlockNumber blkno;
	uint32		orig_firstfree;
	uint32		splitnum;
	uint32	   *freep = NULL;
	uint32		max_ovflpg;
	uint32		bit;
	uint32		first_page;
	uint32		last_bit;
	uint32		last_page;
	uint32		i,
				j;

	/* Get exclusive lock on the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* start search at hashm_firstfree */
	orig_firstfree = metap->hashm_firstfree;
	first_page = orig_firstfree >> BMPG_SHIFT(metap);
	bit = orig_firstfree & BMPG_MASK(metap);
	i = first_page;
	j = bit / BITS_PER_MAP;
	bit &= ~(BITS_PER_MAP - 1);
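
	/*
	 * At this point i, j and bit address the bitmap word containing
	 * hashm_firstfree: i is the bitmap page number, j the word index within
	 * that page, and bit the bit number of that word's first bit (rounded
	 * down to a BITS_PER_MAP boundary so the word-at-a-time scan below
	 * stays aligned).
	 */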

	/* outer loop iterates once per bitmap page */
	for (;;)
	{
		BlockNumber mapblkno;
		Page		mappage;
		uint32		last_inpage;

		/* want to end search with the last existing overflow page */
		splitnum = metap->hashm_ovflpoint;
		max_ovflpg = metap->hashm_spares[splitnum] - 1;
		last_page = max_ovflpg >> BMPG_SHIFT(metap);
		last_bit = max_ovflpg & BMPG_MASK(metap);

		if (i > last_page)
			break;

		Assert(i < metap->hashm_nmaps);
		mapblkno = metap->hashm_mapp[i];

		if (i == last_page)
			last_inpage = last_bit;
		else
			last_inpage = BMPGSZ_BIT(metap) - 1;

		/* Release exclusive lock on metapage while reading bitmap page */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

		mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
		mappage = BufferGetPage(mapbuf);
		freep = HashPageGetBitmap(mappage);

		for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
		{
			if (freep[j] != ALL_SET)
				goto found;
		}

		/* No free space here, try to advance to next map page */
		_hash_relbuf(rel, mapbuf);
		i++;
		j = 0;					/* scan from start of next map page */
		bit = 0;

		/* Reacquire exclusive lock on the meta page */
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
	}

	/*
	 * No free pages --- have to extend the relation to add an overflow page.
	 * First, check to see if we have to add a new bitmap page too.
	 */
	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
	{
		/*
		 * We create the new bitmap page with all pages marked "in use".
		 * Actually two pages in the new bitmap's range will exist
		 * immediately: the bitmap page itself, and the following page which
		 * is the one we return to the caller.  Both of these are correctly
		 * marked "in use".  Subsequent pages do not exist yet, but it is
		 * convenient to pre-mark them as "in use" too.
		 */
		bit = metap->hashm_spares[splitnum];
		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
		metap->hashm_spares[splitnum]++;
	}
	else
	{
		/*
		 * Nothing to do here; since the page will be past the last used
		 * page, we know its bitmap bit was preinitialized to "in use".
		 */
	}

	/* Calculate address of the new overflow page */
	bit = metap->hashm_spares[splitnum];
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
	 * relation length stays in sync with ours.  XXX It's annoying to do this
	 * with metapage write lock held; would be better to use a lock that
	 * doesn't block incoming searches.
	 */
	newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

	metap->hashm_spares[splitnum]++;

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
		metap->hashm_firstfree = bit + 1;

	/* Write updated metapage and release lock, but not pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	return newbuf;

found:
	/* convert bit to bit number within page */
	bit += _hash_firstfreebit(freep[j]);

	/* mark page "in use" in the bitmap */
	SETBIT(freep, bit);
	_hash_wrtbuf(rel, mapbuf);

	/* Reacquire exclusive lock on the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	/* convert bit to absolute bit number */
	bit += (i << BMPG_SHIFT(metap));

	/* Calculate address of the recycled overflow page */
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
	{
		metap->hashm_firstfree = bit + 1;

		/* Write updated metapage and release lock, but not pin */
		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
	}
	else
	{
		/* We didn't change the metapage, so no need to write */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
	}

	/* Fetch, init, and return the recycled page */
	return _hash_getinitbuf(rel, blkno);
}
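
/*
 * Note the asymmetry above: a recycled page is fetched with
 * _hash_getinitbuf, since it already exists on disk and its previous
 * contents are irrelevant, whereas a brand-new page must be obtained with
 * _hash_getnewbuf so that the physical relation length is extended while
 * the metapage lock is still held.
 */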

/*
 *	_hash_firstfreebit()
 *
 *	Return the number of the first bit that is not set in the word 'map'.
 */
static uint32
_hash_firstfreebit(uint32 map)
{
	uint32		i,
				mask;

	mask = 0x1;
	for (i = 0; i < BITS_PER_MAP; i++)
	{
		if (!(mask & map))
			return i;
		mask <<= 1;
	}

	elog(ERROR, "firstfreebit found no free bit");

	return 0;					/* keep compiler quiet */
}
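
/*
 * For example, _hash_firstfreebit(0x0000000B) returns 2: bits 0, 1 and 3
 * of the map are set, so bit 2 is the lowest clear bit.
 */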

/*
 *	_hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold lock on metapage, nor on either page that's
 *	adjacent in the bucket chain.  The caller had better hold exclusive lock
 *	on the bucket, too.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	Bucket		bucket PG_USED_FOR_ASSERTS_ONLY;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	bucket = ovflopaque->hasho_bucket;

	/*
	 * Zero the page for debugging's sake; then write and release it. (Note:
	 * if we failed to zero the page here, we'd have problems with the Assert
	 * in _hash_pageinit() when the page is reused.)
	 */
	MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
	_hash_wrtbuf(rel, ovflbuf);

	/*
	 * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page
	 * being deleted.  No concurrency issues since we hold exclusive lock on
	 * the entire bucket.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		Buffer		prevbuf = _hash_getbuf_with_strategy(rel,
														 prevblkno,
														 HASH_WRITE,
														 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		prevopaque->hasho_nextblkno = nextblkno;
		_hash_wrtbuf(rel, prevbuf);
	}
	if (BlockNumberIsValid(nextblkno))
	{
		Buffer		nextbuf = _hash_getbuf_with_strategy(rel,
														 nextblkno,
														 HASH_WRITE,
														 LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		nextopaque->hasho_prevblkno = prevblkno;
		_hash_wrtbuf(rel, nextbuf);
	}

	/* Note: bstrategy is intentionally not used for metapage and bitmap */

	/* Read the metapage so we can determine which bitmap page to use */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* Identify which bit to clear */
	ovflbitno = blkno_to_bitno(metap, ovflblkno);

	bitmappage = ovflbitno >> BMPG_SHIFT(metap);
	bitmapbit = ovflbitno & BMPG_MASK(metap);

	if (bitmappage >= metap->hashm_nmaps)
		elog(ERROR, "invalid overflow bit number %u", ovflbitno);
	blkno = metap->hashm_mapp[bitmappage];

	/* Release metapage lock while we access the bitmap page */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/* Clear the bitmap bit to indicate that this overflow page is free */
	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
	mappage = BufferGetPage(mapbuf);
	freep = HashPageGetBitmap(mappage);
	Assert(ISSET(freep, bitmapbit));
	CLRBIT(freep, bitmapbit);
	_hash_wrtbuf(rel, mapbuf);

	/* Get write-lock on metapage to update firstfree */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	/* if this is now the first free page, update hashm_firstfree */
	if (ovflbitno < metap->hashm_firstfree)
	{
		metap->hashm_firstfree = ovflbitno;
		_hash_wrtbuf(rel, metabuf);
	}
	else
	{
		/* no need to change metapage */
		_hash_relbuf(rel, metabuf);
	}

	return nextblkno;
}

/*
 *	_hash_initbitmap()
 *
 *	Initialize a new bitmap page.  The metapage has a write-lock upon
 *	entering the function, and must be written by caller after return.
 *
 * 'blkno' is the block number of the new bitmap page.
 *
 * All bits in the new bitmap page are set to "1", indicating "in use".
 */
void
_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
				 ForkNumber forkNum)
{
	Buffer		buf;
	Page		pg;
	HashPageOpaque op;
	uint32	   *freep;

	/*
	 * It is okay to write-lock the new bitmap page while holding metapage
	 * write lock, because no one else could be contending for the new page.
	 * Also, the metapage lock makes it safe to extend the index using
	 * _hash_getnewbuf.
	 *
	 * There is some loss of concurrency in possibly doing I/O for the new
	 * page while holding the metapage lock, but this path is taken so seldom
	 * that it's not worth worrying about.
	 */
	buf = _hash_getnewbuf(rel, blkno, forkNum);
	pg = BufferGetPage(buf);

	/* initialize the page's special space */
	op = (HashPageOpaque) PageGetSpecialPointer(pg);
	op->hasho_prevblkno = InvalidBlockNumber;
	op->hasho_nextblkno = InvalidBlockNumber;
	op->hasho_bucket = -1;
	op->hasho_flag = LH_BITMAP_PAGE;
	op->hasho_page_id = HASHO_PAGE_ID;

	/* set all of the bits to 1 */
	freep = HashPageGetBitmap(pg);
	MemSet(freep, 0xFF, BMPGSZ_BYTE(metap));

	/* write out the new bitmap page (releasing write lock and pin) */
	_hash_wrtbuf(rel, buf);

	/* add the new bitmap page to the metapage's list of bitmaps */
	/* metapage already has a write lock */
	if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("out of overflow pages in hash index \"%s\"",
						RelationGetRelationName(rel))));

	metap->hashm_mapp[metap->hashm_nmaps] = blkno;

	metap->hashm_nmaps++;
}
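
/*
 * The HASH_MAX_BITMAPS test above is a hard limit: once hashm_mapp[] is
 * full, no further overflow pages can ever be allocated for this index.
 * Each bitmap page tracks BMPGSZ_BIT(metap) overflow pages, though, so in
 * practice the limit is generous.
 */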

/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages.  When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last bucket in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					BufferAccessStrategy bstrategy)
{
	BlockNumber wblkno;
	BlockNumber rblkno;
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	bool		wbuf_dirty;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf_with_strategy(rel,
									  wblkno,
									  HASH_WRITE,
									  LH_BUCKET_PAGE,
									  bstrategy);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	rbuf = InvalidBuffer;
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (rbuf != InvalidBuffer)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	wbuf_dirty = false;
	for (;;)
	{
		OffsetNumber roffnum;
		OffsetNumber maxroffnum;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;

		/* Scan each tuple in "read" page */
		maxroffnum = PageGetMaxOffsetNumber(rpage);
		for (roffnum = FirstOffsetNumber;
			 roffnum <= maxroffnum;
			 roffnum = OffsetNumberNext(roffnum))
		{
			IndexTuple	itup;
			Size		itemsz;

			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				if (wbuf_dirty)
					_hash_wrtbuf(rel, wbuf);
				else
					_hash_relbuf(rel, wbuf);

				/* nothing more to do if we reached the read page */
				if (rblkno == wblkno)
				{
					if (ndeletable > 0)
					{
						/* Delete tuples we already moved off read page */
						PageIndexMultiDelete(rpage, deletable, ndeletable);
						_hash_wrtbuf(rel, rbuf);
					}
					else
						_hash_relbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf_with_strategy(rel,
												  wblkno,
												  HASH_WRITE,
												  LH_OVERFLOW_PAGE,
												  bstrategy);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
				wbuf_dirty = false;
			}

			/*
			 * we have found room so insert on the "write" page, being
			 * careful to preserve hashkey ordering.  (If we insert many
			 * tuples into the same "write" page it would be worth qsort'ing
			 * instead of doing repeated _hash_pgaddtup.)
			 */
			(void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
			wbuf_dirty = true;

			/* remember tuple for deletion from "read" page */
			deletable[ndeletable++] = roffnum;
		}

		/*
		 * If we reach here, there are no live tuples on the "read" page ---
		 * it was empty when we got to it, or we moved them all.  So we can
		 * just free the page without bothering with deleting tuples
		 * individually.  Then advance to the previous "read" page.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		rblkno = ropaque->hasho_prevblkno;
		Assert(BlockNumberIsValid(rblkno));

		/* are we freeing the page adjacent to wbuf? */
		if (rblkno == wblkno)
		{
			/* yes, so release wbuf lock first */
			if (wbuf_dirty)
				_hash_wrtbuf(rel, wbuf);
			else
				_hash_relbuf(rel, wbuf);
			/* free this overflow page (releases rbuf) */
			_hash_freeovflpage(rel, rbuf, bstrategy);
			/* done */
			return;
		}

		/* free this overflow page, then get the previous one */
		_hash_freeovflpage(rel, rbuf, bstrategy);

		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	}
}