/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "optimizer/cost.h"
#include "optimizer/plancat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
} HashBuildState;

static void hashbuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state);

/*
 * hashbuild() -- build a new hash index.
 */
Datum
hashbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    BlockNumber relpages;
    double      reltuples;
    double      allvisfrac;
    uint32      num_buckets;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no
     * locality of access to the index, and if the index is bigger than
     * available RAM then we'll thrash horribly.  To prevent that scenario,
     * we can sort the tuples by (expected) bucket number.  However, such a
     * sort is useless overhead when the index does fit in RAM.  We choose
     * to sort if the initial index size exceeds NBuffers.
     *
     * NOTE: this test will need adjustment if a bucket is ever different
     * from one page.
     */
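    /*
     * (Illustrative arithmetic, not from the original comment: with the
     * default 8kB block size and one bucket per page, shared_buffers =
     * 128MB gives NBuffers = 16384, so the sorting path is taken only when
     * the initial index needs at least 16384 bucket pages, i.e. about
     * 128MB.)
     */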
    if (num_buckets >= (uint32) NBuffers)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   hashbuildCallback, (void *) &buildstate);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool);
        _h_spooldestroy(buildstate.spool);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
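
/*
 * Example usage (illustrative, not part of the original file): the build
 * path above runs when a hash index is created at the SQL level, e.g.
 *
 *      CREATE INDEX test_hash_idx ON test USING hash (col);
 */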

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 */
Datum
hashbuildempty(PG_FUNCTION_ARGS)
{
    Relation    index = (Relation) PG_GETARG_POINTER(0);

    _hash_metapinit(index, 0, INIT_FORKNUM);

    PG_RETURN_VOID();
}

/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
hashbuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    HashBuildState *buildstate = (HashBuildState *) state;
    IndexTuple  itup;

    /* Hash indexes don't index nulls, see notes in hashinsert */
    if (isnull[0])
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, &htup->t_self, values, isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = _hash_form_tuple(index, values, isnull);
        itup->t_tid = htup->t_self;
        _hash_doinsert(index, itup);
        pfree(itup);
    }

    buildstate->indtuples += 1;
}

/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
Datum
hashinsert(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);

#ifdef NOT_USED
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
#endif
    IndexTuple  itup;

    /*
     * If the single index key is null, we don't insert it into the index.
     * Hash tables support scans on '='. Relational algebra says that A = B
     * returns null if either A or B is null. This means that no
     * qualification used in an index scan could ever return true on a null
     * attribute. It also means that indices can't be used by ISNULL or
     * NOTNULL scans, but that's an artifact of the strategy map architecture
     * chosen in 1986, not of the way nulls are handled here.
     */
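    /*
     * (Illustrative example, not in the original comment: "WHERE col = NULL"
     * evaluates to NULL for every row rather than true, so an index entry
     * made for a NULL key could never satisfy an '=' scan anyway.)
     */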
    if (isnull[0])
        PG_RETURN_BOOL(false);

    /* generate an index tuple */
    itup = _hash_form_tuple(rel, values, isnull);
    itup->t_tid = *ht_ctid;

    _hash_doinsert(rel, itup);

    pfree(itup);

    PG_RETURN_BOOL(false);
}

/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;
    Buffer      buf;
    Page        page;
    OffsetNumber offnum;
    ItemPointer current;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;
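    /*
     * (Added note: because only the 32-bit hash code is stored, two distinct
     * keys that hash to the same value match the same index entries; the
     * executor must recheck the original qual against each returned heap
     * tuple.)
     */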

    /*
     * We hold pin but not lock on current buffer while outside the hash AM.
     * Reacquire the read lock here.
     */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine
     * to get the first item in the scan.
     */
    current = &(so->hashso_curpos);
    if (ItemPointerIsValid(current))
    {
        /*
         * An insertion into the current index page could have happened
         * while we didn't have read lock on it.  Re-find our position by
         * looking for the TID we previously returned.  (Because we hold
         * share lock on the bucket, no deletions or splits could have
         * occurred; therefore we can expect that the TID still exists in
         * the current index page, at an offset >= where we were.)
         */
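        /*
         * (Added note: a concurrent insertion can only push our tuple to a
         * higher offset on this page, which is why the search below starts
         * at the old offset and moves forward only.)
         */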
        OffsetNumber maxoffnum;

        buf = so->hashso_curbuf;
        Assert(BufferIsValid(buf));
        page = BufferGetPage(buf);
        maxoffnum = PageGetMaxOffsetNumber(page);
        for (offnum = ItemPointerGetOffsetNumber(current);
             offnum <= maxoffnum;
             offnum = OffsetNumberNext(offnum))
        {
            IndexTuple  itup;

            itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
                break;
        }
        if (offnum > maxoffnum)
            elog(ERROR, "failed to re-find scan position within index \"%s\"",
                 RelationGetRelationName(rel));
        ItemPointerSetOffsetNumber(current, offnum);

        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so mark it by setting the LP_DEAD state in the item
             * flags.
             */
            ItemIdMarkDead(PageGetItemId(page, offnum));

            /*
             * Since this can be redone later if needed, mark as a hint.
             */
            MarkBufferDirtyHint(buf, true);
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }
    else
        res = _hash_first(scan, dir);

    /*
     * Skip killed tuples if asked to.
     */
    if (scan->ignore_killed_tuples)
    {
        while (res)
        {
            offnum = ItemPointerGetOffsetNumber(current);
            page = BufferGetPage(so->hashso_curbuf);
            if (!ItemIdIsDead(PageGetItemId(page, offnum)))
                break;
            res = _hash_next(scan, dir);
        }
    }

    /* Release read lock on current buffer, but keep it pinned */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

    /* Return current heap TID on success */
    scan->xs_ctup.t_self = so->hashso_heappos;

    PG_RETURN_BOOL(res);
}

/*
 * hashgetbitmap() -- get all tuples at once
 */
Datum
hashgetbitmap(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;
    int64       ntids = 0;

    res = _hash_first(scan, ForwardScanDirection);

    while (res)
    {
        bool        add_tuple;

        /*
         * Skip killed tuples if asked to.
         */
        if (scan->ignore_killed_tuples)
        {
            Page        page;
            OffsetNumber offnum;

            offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
            page = BufferGetPage(so->hashso_curbuf);
            add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
        }
        else
            add_tuple = true;

        /* Save tuple ID, and continue scanning */
        if (add_tuple)
        {
            /* Note we mark the tuple ID as requiring recheck */
            tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
            ntids++;
        }

        res = _hash_next(scan, ForwardScanDirection);
    }

    PG_RETURN_INT64(ntids);
}

/*
 * hashbeginscan() -- start a scan on a hash index
 */
Datum
hashbeginscan(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         nkeys = PG_GETARG_INT32(1);
    int         norderbys = PG_GETARG_INT32(2);
    IndexScanDesc scan;
    HashScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    so->hashso_bucket_valid = false;
    so->hashso_bucket_blkno = 0;
    so->hashso_curbuf = InvalidBuffer;
    /* set position invalid (this will cause _hash_first call) */
    ItemPointerSetInvalid(&(so->hashso_curpos));
    ItemPointerSetInvalid(&(so->hashso_heappos));

    scan->opaque = so;

    /* register scan in case we change pages it's using */
    _hash_regscan(scan);

    PG_RETURN_POINTER(scan);
}

/*
 * hashrescan() -- rescan an index relation
 */
Datum
hashrescan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);

    /* remaining arguments are ignored */
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    /* release any pin we still hold */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_dropbuf(rel, so->hashso_curbuf);
    so->hashso_curbuf = InvalidBuffer;

    /* release lock on bucket, too */
    if (so->hashso_bucket_blkno)
        _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
    so->hashso_bucket_blkno = 0;

    /* set position invalid (this will cause _hash_first call) */
    ItemPointerSetInvalid(&(so->hashso_curpos));
    ItemPointerSetInvalid(&(so->hashso_heappos));

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
    {
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
        so->hashso_bucket_valid = false;
    }

    PG_RETURN_VOID();
}

/*
 * hashendscan() -- close down a scan
 */
Datum
hashendscan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    /* don't need scan registered anymore */
    _hash_dropscan(scan);

    /* release any pin we still hold */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_dropbuf(rel, so->hashso_curbuf);
    so->hashso_curbuf = InvalidBuffer;

    /* release lock on bucket, too */
    if (so->hashso_bucket_blkno)
        _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
    so->hashso_bucket_blkno = 0;

    pfree(so);
    scan->opaque = NULL;

    PG_RETURN_VOID();
}

/*
 * hashmarkpos() -- save current scan position
 */
Datum
hashmarkpos(PG_FUNCTION_ARGS)
{
    elog(ERROR, "hash does not support mark/restore");
    PG_RETURN_VOID();
}

/*
 * hashrestrpos() -- restore scan to last saved position
 */
Datum
hashrestrpos(PG_FUNCTION_ARGS)
{
    elog(ERROR, "hash does not support mark/restore");
    PG_RETURN_VOID();
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPageData local_metapage;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * Read the metapage to fetch original bucket and tuple counts.  Also,
     * we keep a copy of the last-seen metapage so that we can use its
     * hashm_spares[] values to compute bucket page addresses.  This is a
     * bit hokey but perfectly safe, since the interesting entries in the
     * spares array cannot change under us; and it beats rereading the
     * metapage for each bucket.
     */
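    /*
     * (Added note: BUCKET_TO_BLKNO() consults only the hashm_spares[]
     * entries for splits that had already happened when the copy was taken,
     * so the cached copy keeps mapping those buckets to the correct block
     * numbers even if the live metapage changes afterward.)
     */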
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));
    orig_maxbucket = metap->hashm_maxbucket;
    orig_ntuples = metap->hashm_ntuples;
    memcpy(&local_metapage, metap, sizeof(local_metapage));
    _hash_relbuf(rel, metabuf);

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        bool        bucket_dirty = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

        /* Exclusive-lock the bucket so we can shrink it */
        _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Shouldn't have any active scans locally, either */
        if (_hash_has_active_scan(rel, cur_bucket))
            elog(ERROR, "hash index has active scan during VACUUM");

        /* Scan each page in bucket */
        blkno = bucket_blkno;
        while (BlockNumberIsValid(blkno))
        {
            Buffer      buf;
            Page        page;
            HashPageOpaque opaque;
            OffsetNumber offno;
            OffsetNumber maxoffno;
            OffsetNumber deletable[MaxOffsetNumber];
            int         ndeletable = 0;

            vacuum_delay_point();

            buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                             info->strategy);
            page = BufferGetPage(buf);
            opaque = (HashPageOpaque) PageGetSpecialPointer(page);
            Assert(opaque->hasho_bucket == cur_bucket);

            /* Scan each tuple in page */
            maxoffno = PageGetMaxOffsetNumber(page);
            for (offno = FirstOffsetNumber;
                 offno <= maxoffno;
                 offno = OffsetNumberNext(offno))
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offno));
                htup = &(itup->t_tid);
                if (callback(htup, callback_state))
                {
                    /* mark the item for deletion */
                    deletable[ndeletable++] = offno;
                    tuples_removed += 1;
                }
                else
                    num_index_tuples += 1;
            }

            /*
             * Apply deletions and write page if needed, advance to next
             * page.
             */
            blkno = opaque->hasho_nextblkno;

            if (ndeletable > 0)
            {
                PageIndexMultiDelete(page, deletable, ndeletable);
                _hash_wrtbuf(rel, buf);
                bucket_dirty = true;
            }
            else
                _hash_relbuf(rel, buf);
        }

        /* If we deleted anything, try to compact free space */
        if (bucket_dirty)
            _hash_squeezebucket(rel, cur_bucket, bucket_blkno,
                                info->strategy);

        /* Release bucket lock */
        _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Advance to next bucket */
        cur_bucket++;
    }

    /* Write-lock metapage and check for split since we started */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        cur_maxbucket = metap->hashm_maxbucket;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
        _hash_relbuf(rel, metabuf);
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by
         * dead-reckoning.  (Note: we still return estimated_count = false,
         * because using this count is better than not updating reltuples
         * at all.)
         */
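        /*
         * (Illustrative scenario, not from the original: if a bucket we
         * already scanned splits before we finish, the tuples it moved to
         * the new bucket are counted a second time when the loop_top pass
         * visits that bucket, so num_index_tuples can overshoot; adjusting
         * the stored count by tuples_removed sidesteps that bias.)
         */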
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    _hash_wrtbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    PG_RETURN_POINTER(stats);
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashvacuumcleanup(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        PG_RETURN_POINTER(NULL);

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

    PG_RETURN_POINTER(stats);
}

void
hash_redo(XLogReaderState *record)
{
    elog(PANIC, "hash_redo: unimplemented");
}
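
/*
 * (Added note: hash index operations are not WAL-logged in this release, so
 * there is nothing to replay here; this is why hash indexes are not
 * crash-safe and why creating one warns that its use is discouraged.)
 */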