/*-------------------------------------------------------------------------
 *
 * Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/hash/hash.c
 *
 * This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/plancat.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
} HashBuildState;

static void hashbuildCallback(Relation index,
                              HeapTuple htup,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);

/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 */
hashhandler(PG_FUNCTION_ARGS)
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
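
    /* fixed limits and capability flags for the hash AM */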
    amroutine->amstrategies = HTMaxStrategyNumber;
    amroutine->amsupport = HASHNProcs;
    amroutine->amcanorder = false;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = false;
    amroutine->amcanmulticol = false;
    amroutine->amoptionalkey = false;
    amroutine->amsearcharray = false;
    amroutine->amsearchnulls = false;
    amroutine->amstorage = false;
    amroutine->amclusterable = false;
    amroutine->ampredlocks = false;
    amroutine->amcanparallel = false;
    amroutine->amkeytype = INT4OID;

    amroutine->ambuild = hashbuild;
    amroutine->ambuildempty = hashbuildempty;
    amroutine->aminsert = hashinsert;
    amroutine->ambulkdelete = hashbulkdelete;
    amroutine->amvacuumcleanup = hashvacuumcleanup;
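    /* no index-only scans: only the hash code is stored, not the key itself */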
    amroutine->amcanreturn = NULL;
    amroutine->amcostestimate = hashcostestimate;
    amroutine->amoptions = hashoptions;
    amroutine->amproperty = NULL;
    amroutine->amvalidate = hashvalidate;
    amroutine->ambeginscan = hashbeginscan;
    amroutine->amrescan = hashrescan;
    amroutine->amgettuple = hashgettuple;
    amroutine->amgetbitmap = hashgetbitmap;
    amroutine->amendscan = hashendscan;
    amroutine->ammarkpos = NULL;
    amroutine->amrestrpos = NULL;
    amroutine->amestimateparallelscan = NULL;
    amroutine->aminitparallelscan = NULL;
    amroutine->amparallelrescan = NULL;

    PG_RETURN_POINTER(amroutine);

/*
 * hashbuild() -- build a new hash index.
 */
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
    IndexBuildResult *result;
    BlockNumber relpages;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly. To prevent that scenario, we can sort the
     * tuples by (expected) bucket number. However, such a sort is useless
     * overhead when the index does fit in RAM. We choose to sort if the
     * initial index size exceeds maintenance_work_mem, or the number of
     * buffers usable for the index, whichever is less. (Limiting by the
     * number of buffers should reduce thrashing between PG buffers and kernel
     * buffers, which seems useful even if no physical I/O results. Limiting
     * by maintenance_work_mem is useful to allow easy testing of the sort
     * code path, and may be useful to DBAs as an additional control knob.)
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page. Also, "initial index size" accounting does not include the
     * metapage, nor the first bitmap page.
     */
    sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
    if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
        sort_threshold = Min(sort_threshold, NBuffers);
    else
        sort_threshold = Min(sort_threshold, NLocBuffer);

    if (num_buckets >= (uint32) sort_threshold)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   hashbuildCallback, (void *) &buildstate);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool);
        _h_spooldestroy(buildstate.spool);
    }

    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 */
hashbuildempty(Relation index)
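    /*
     * The init fork is copied over the main fork at the end of crash
     * recovery, so an unlogged hash index comes back as a valid, empty index.
     */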
    _hash_init(index, 0, INIT_FORKNUM);

/*
 * Per-tuple callback from IndexBuildHeapScan
 */
hashbuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
    HashBuildState *buildstate = (HashBuildState *) state;
    Datum       index_values[1];
    bool        index_isnull[1];

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(index,
                             values, isnull,
                             index_values, index_isnull))
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, &htup->t_self,
                 index_values, index_isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = index_form_tuple(RelationGetDescr(index),
                                index_values, index_isnull);
        itup->t_tid = htup->t_self;
        _hash_doinsert(index, itup);
    }

    buildstate->indtuples += 1;

/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
hashinsert(Relation rel, Datum *values, bool *isnull,
           ItemPointer ht_ctid, Relation heapRel,
           IndexUniqueCheck checkUnique,
           IndexInfo *indexInfo)
    Datum       index_values[1];
    bool        index_isnull[1];

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(rel,
                             values, isnull,
                             index_values, index_isnull))
        return false;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
    itup->t_tid = *ht_ctid;

    _hash_doinsert(rel, itup);

/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
hashgettuple(IndexScanDesc scan, ScanDirection dir)
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;

    /*
     * We hold a pin but not a lock on the current buffer while outside the
     * hash AM. Reacquire the read lock here.
     */
    if (BufferIsValid(so->hashso_curbuf))
        LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction. If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    current = &(so->hashso_curpos);
    if (ItemPointerIsValid(current))
    {
        /*
         * An insertion into the current index page could have happened while
         * we didn't have read lock on it. Re-find our position by looking
         * for the TID we previously returned. (Because we hold a pin on the
         * primary bucket page, no deletions or splits could have occurred;
         * therefore we can expect that the TID still exists in the current
         * index page, at an offset >= where we were.)
         */
        OffsetNumber maxoffnum;

        buf = so->hashso_curbuf;
        Assert(BufferIsValid(buf));
        page = BufferGetPage(buf);

        /*
         * We don't need to test for an old snapshot here, as the current
         * buffer is pinned, so vacuum can't clean the page.
         */
        maxoffnum = PageGetMaxOffsetNumber(page);
        for (offnum = ItemPointerGetOffsetNumber(current);
             offnum <= maxoffnum;
             offnum = OffsetNumberNext(offnum))
        {
            itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
                break;
        }
        if (offnum > maxoffnum)
            elog(ERROR, "failed to re-find scan position within index \"%s\"",
                 RelationGetRelationName(rel));
        ItemPointerSetOffsetNumber(current, offnum);

        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so mark it by setting the LP_DEAD state in the item flags.
             */
            ItemIdMarkDead(PageGetItemId(page, offnum));

            /*
             * Since this can be redone later if needed, mark as a hint.
             */
            MarkBufferDirtyHint(buf, true);
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }
    else
        res = _hash_first(scan, dir);

    /*
     * Skip killed tuples if asked to.
     */
    if (scan->ignore_killed_tuples)
    {
        while (res)
        {
            offnum = ItemPointerGetOffsetNumber(current);
            page = BufferGetPage(so->hashso_curbuf);
            if (!ItemIdIsDead(PageGetItemId(page, offnum)))
                break;
            res = _hash_next(scan, dir);
        }
    }

    /* Release read lock on current buffer, but keep it pinned */
    if (BufferIsValid(so->hashso_curbuf))
        LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);

    /* Return current heap TID on success */
    scan->xs_ctup.t_self = so->hashso_heappos;

/*
 * hashgetbitmap() -- get all tuples at once
 */
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
    HashScanOpaque so = (HashScanOpaque) scan->opaque;

    res = _hash_first(scan, ForwardScanDirection);

    /*
     * Skip killed tuples if asked to.
     */
    if (scan->ignore_killed_tuples)
    {
        offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
        page = BufferGetPage(so->hashso_curbuf);
        add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
    }
    else
        add_tuple = true;

    /* Save tuple ID, and continue scanning */
    if (add_tuple)
        /* Note we mark the tuple ID as requiring recheck */
        tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);

    res = _hash_next(scan, ForwardScanDirection);

/*
 * hashbeginscan() -- start a scan on a hash index
 */
hashbeginscan(Relation rel, int nkeys, int norderbys)
    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);
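
    /* allocate and initialize the private (hash-specific) scan state */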
    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    so->hashso_curbuf = InvalidBuffer;
    so->hashso_bucket_buf = InvalidBuffer;
    so->hashso_split_bucket_buf = InvalidBuffer;
    /* set position invalid (this will cause _hash_first call) */
    ItemPointerSetInvalid(&(so->hashso_curpos));
    ItemPointerSetInvalid(&(so->hashso_heappos));

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;

/*
 * hashrescan() -- rescan an index relation
 */
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
           ScanKey orderbys, int norderbys)
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    _hash_dropscanbuf(rel, so);

    /* set position invalid (this will cause _hash_first call) */
    ItemPointerSetInvalid(&(so->hashso_curpos));
    ItemPointerSetInvalid(&(so->hashso_heappos));

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
    {
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    }

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;

/*
 * hashendscan() -- close down a scan
 */
hashendscan(IndexScanDesc scan)
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    _hash_dropscanbuf(rel, so);

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that are moved by split to other
 * buckets.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage cachedmetap;

    num_index_tuples = 0;

    /*
     * We need a copy of the metapage so that we can use its hashm_spares[]
     * values to compute bucket page addresses, but a cached copy should be
     * good enough. (If not, we'll detect that further down and refresh the
     * cache as necessary.)
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(cachedmetap != NULL);

    orig_maxbucket = cachedmetap->hashm_maxbucket;
    orig_ntuples = cachedmetap->hashm_ntuples;

    /* Scan the buckets that we know exist */
    cur_maxbucket = orig_maxbucket;

    while (cur_bucket <= cur_maxbucket)
        BlockNumber bucket_blkno;
        HashPageOpaque bucket_opaque;
        bool        split_cleanup = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

        blkno = bucket_blkno;

        /*
         * We need to acquire a cleanup lock on the primary bucket page to
         * wait out concurrent scans before deleting the dead tuples.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE);

        page = BufferGetPage(buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /*
         * If the bucket contains tuples that are moved by split, then we need
         * to delete such tuples. We can't delete such tuples if the split
         * operation on the bucket is not finished, as those tuples are still
         * needed by scans.
         */
        if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
            H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
        {
            split_cleanup = true;

            /*
             * This bucket might have been split since we last held a lock on
             * the metapage. If so, hashm_maxbucket, hashm_highmask and
             * hashm_lowmask might be old enough to cause us to fail to remove
             * tuples left behind by the most recent split. To prevent that,
             * now that the primary page of the target bucket has been locked
             * (and thus can't be further split), check whether we need to
             * update our cached metapage data.
             *
             * NB: The check for InvalidBlockNumber is only needed for
             * on-disk compatibility with indexes created before we started
             * storing hashm_maxbucket in the primary page's hasho_prevblkno.
             */
            if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber &&
                bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            {
                cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
                Assert(cachedmetap != NULL);
            }
        }

        bucket_buf = buf;

        hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
                          cachedmetap->hashm_maxbucket,
                          cachedmetap->hashm_highmask,
                          cachedmetap->hashm_lowmask, &tuples_removed,
                          &num_index_tuples, split_cleanup,
                          callback, callback_state);

        _hash_dropbuf(rel, bucket_buf);

        /* Advance to next bucket */
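
    /*
     * The metapage is pinned here without taking a lock; the exclusive lock
     * is acquired just below, before hashm_maxbucket is examined.
     */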
    if (BufferIsInvalid(metabuf))
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

    /* Write-lock metapage and check for split since we started */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
        cur_maxbucket = cachedmetap->hashm_maxbucket;
    }

    /* Okay, we're really done. Update tuple count in metapage. */
    START_CRIT_SECTION();

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets. Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    MarkBufferDirty(metabuf);
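
    /* XLOG stuff: log the updated tuple count on the metapage */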
    if (RelationNeedsWAL(rel))
    {
        xl_hash_update_meta_page xlrec;

        xlrec.ntuples = metap->hashm_ntuples;

        XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

        XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    _hash_relbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        return NULL;

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page. The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID. There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be. We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan. To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page. (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * splits can start.
 */
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
    Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool        bucket_dirty = false;

    blkno = bucket_blkno;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
        HashPageOpaque opaque;
        OffsetNumber maxoffno;
        OffsetNumber deletable[MaxOffsetNumber];
        bool        retain_pin = false;
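
        /* check for interrupts and allow cost-based vacuum delay between pages */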
        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            bool        kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we strictly rely on the results of
             * the callback function; refer to btvacuumpage for the reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask, lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to the current bucket
                     * or new_bucket. This is ensured because we don't allow
                     * further splits from a bucket that contains garbage. See
                     * comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            /* No ereport(ERROR) until changes are logged */
            START_CRIT_SECTION();

            PageIndexMultiDelete(page, deletable, ndeletable);
            MarkBufferDirty(buf);
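
            /* XLOG stuff: log the offsets of the removed index entries */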
            if (RelationNeedsWAL(rel))
            {
                xl_hash_delete xlrec;

                xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;

                XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

                /*
                 * The bucket buffer needs to be registered to ensure that we
                 * can acquire a cleanup lock on it during replay.
                 */
                if (!xlrec.is_primary_bucket_page)
                    XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

                XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
                XLogRegisterBufData(1, (char *) deletable,
                                    ndeletable * sizeof(OffsetNumber));

                recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
                PageSetLSN(BufferGetPage(buf), recptr);
            }
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE, bstrategy);

        /*
         * release the lock on the previous page after acquiring the lock on
         * the next page
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

    /*
     * Lock the bucket page to clear the garbage flag and squeeze the bucket.
     * If the current buffer is the same as the bucket buffer, then we already
     * have a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * are moved by split. We purposefully clear the flag before squeezing the
     * bucket, so that after a restart, vacuum won't again try to delete the
     * moved tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);
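
        /* XLOG stuff: log clearing of the split-cleanup flag */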
        if (RelationNeedsWAL(rel))
        {
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(page, recptr);
        }
    }

    /*
     * If we have deleted anything, try to compact free space. For squeezing
     * the bucket we must have a cleanup lock, else it can impact the ordering
     * of tuples for a scan that has started before it.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);