1 /*-------------------------------------------------------------------------
6 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 * shared memory. For shared hash tables, it is the caller's responsibility
8 * to provide appropriate access interlocking. The simplest convention is
9 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 * hash_seq_search) need only shared lock, but any update requires exclusive
11 * lock. For heavily-used shared tables, the single-lock approach creates a
12 * concurrency bottleneck, so we also support "partitioned" locking wherein
13 * there are multiple LWLocks guarding distinct subsets of the table. To use
14 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 * Therefore, each hash bucket chain operates independently, and no fields
17 * of the hash header change after init except nentries and freeList.
18 * A partitioned table uses a spinlock to guard changes of those two fields.
19 * This lets any subset of the hash buckets be treated as a separately
20 * lockable partition. We expect callers to use the low-order bits of a
21 * lookup key's hash value as a partition number --- this will work because
22 * of the way calc_bucket() maps hash values to bucket numbers.
24 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
25 * Portions Copyright (c) 1994, Regents of the University of California
29 * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.79 2009/01/01 17:23:51 momjian Exp $
31 *-------------------------------------------------------------------------
37 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
38 * Coded into C, with minor code improvements, and with hsearch(3) interface,
39 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
40 * also, hcreate/hdestroy routines added to simulate hsearch(3).
42 * These routines simulate hsearch(3) and family, with the important
43 * difference that the hash table is dynamic - can grow indefinitely
44 * beyond its original size (as supplied to hcreate()).
46 * Performance appears to be comparable to that of hsearch(3).
47 * The 'source-code' options referred to in hsearch(3)'s 'man' page
48 * are not implemented; otherwise functionality is identical.
50 * Compilation controls:
51 * DEBUG controls some informative traces, mainly for debugging.
52 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
53 * when combined with HASH_DEBUG, these are displayed by hdestroy().
55 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
56 * concatenation property, in probably unnecessary code 'optimisation'.
58 * Modified margo@postgres.berkeley.edu February 1990
59 * added multiple table interface
60 * Modified by sullivan@postgres.berkeley.edu April 1990
61 * changed ctl structure for shared memory
66 #include "access/xact.h"
67 #include "storage/shmem.h"
68 #include "storage/spin.h"
69 #include "utils/dynahash.h"
70 #include "utils/memutils.h"
76 * A hash table has a top-level "directory", each of whose entries points
77 * to a "segment" of ssize bucket headers. The maximum number of hash
78 * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
79 * the number of records in the table can be larger, but we don't want a
80 * whole lot of records per bucket or performance goes down.
82 * In a hash table allocated in shared memory, the directory cannot be
83 * expanded because it must stay at a fixed address. The directory size
84 * should be selected using hash_select_dirsize (and you'd better have
85 * a good idea of the maximum number of entries!). For non-shared hash
86 * tables, the initial directory size can be left at the default.
88 #define DEF_SEGSIZE 256
89 #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
90 #define DEF_DIRSIZE 256
91 #define DEF_FFACTOR 1 /* default fill factor */
94 /* A hash bucket is a linked list of HASHELEMENTs */
95 typedef HASHELEMENT *HASHBUCKET;
97 /* A hash segment is an array of bucket headers */
98 typedef HASHBUCKET *HASHSEGMENT;
101 * Header structure for a hash table --- contains all changeable info
103 * In a shared-memory hash table, the HASHHDR is in shared memory, while
104 * each backend has a local HTAB struct. For a non-shared table, there isn't
105 * any functional difference between HASHHDR and HTAB, but we separate them
106 * anyway to share code between shared and non-shared tables.
110 /* In a partitioned table, take this lock to touch nentries or freeList */
111 slock_t mutex; /* unused if not partitioned table */
113 /* These fields change during entry addition/deletion */
114 long nentries; /* number of entries in hash table */
115 HASHELEMENT *freeList; /* linked list of free elements */
117 /* These fields can change, but not in a partitioned table */
118 /* Also, dsize can't change in a shared table, even if unpartitioned */
119 long dsize; /* directory size */
120 long nsegs; /* number of allocated segments (<= dsize) */
121 uint32 max_bucket; /* ID of maximum bucket in use */
122 uint32 high_mask; /* mask to modulo into entire table */
123 uint32 low_mask; /* mask to modulo into lower half of table */
125 /* These fields are fixed at hashtable creation */
126 Size keysize; /* hash key length in bytes */
127 Size entrysize; /* total user element size in bytes */
128 long num_partitions; /* # partitions (must be power of 2), or 0 */
129 long ffactor; /* target fill factor */
130 long max_dsize; /* 'dsize' limit if directory is fixed size */
131 long ssize; /* segment size --- must be power of 2 */
132 int sshift; /* segment shift = log2(ssize) */
133 int nelem_alloc; /* number of entries to allocate at once */
135 #ifdef HASH_STATISTICS
138 * Count statistics here. NB: stats code doesn't bother with mutex, so
139 * counts could be corrupted a bit in a partitioned table.
146 #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
149 * Top control structure for a hashtable --- in a shared table, each backend
150 * has its own copy (OK since no fields change at runtime)
154 HASHHDR *hctl; /* => shared control information */
155 HASHSEGMENT *dir; /* directory of segment starts */
156 HashValueFunc hash; /* hash function */
157 HashCompareFunc match; /* key comparison function */
158 HashCopyFunc keycopy; /* key copying function */
159 HashAllocFunc alloc; /* memory allocator */
160 MemoryContext hcxt; /* memory context if default allocator used */
161 char *tabname; /* table name (for error messages) */
162 bool isshared; /* true if table is in shared memory */
164 /* freezing a shared table isn't allowed, so we can keep state here */
165 bool frozen; /* true = no more inserts allowed */
167 /* We keep local copies of these fixed values to reduce contention */
168 Size keysize; /* hash key length in bytes */
169 long ssize; /* segment size --- must be power of 2 */
170 int sshift; /* segment shift = log2(ssize) */
174 * Key (also entry) part of a HASHELEMENT
176 #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
179 * Fast MOD arithmetic, assuming that y is a power of 2 !
181 #define MOD(x,y) ((x) & ((y)-1))
184 static long hash_accesses,
190 * Private function prototypes
192 static void *DynaHashAlloc(Size size);
193 static HASHSEGMENT seg_alloc(HTAB *hashp);
194 static bool element_alloc(HTAB *hashp, int nelem);
195 static bool dir_realloc(HTAB *hashp);
196 static bool expand_table(HTAB *hashp);
197 static HASHBUCKET get_hash_entry(HTAB *hashp);
198 static void hdefault(HTAB *hashp);
199 static int choose_nelem_alloc(Size entrysize);
200 static bool init_htab(HTAB *hashp, long nelem);
201 static void hash_corrupted(HTAB *hashp);
202 static void register_seq_scan(HTAB *hashp);
203 static void deregister_seq_scan(HTAB *hashp);
204 static bool has_seq_scans(HTAB *hashp);
208 * memory allocation support
210 static MemoryContext CurrentDynaHashCxt = NULL;
213 DynaHashAlloc(Size size)
215 Assert(MemoryContextIsValid(CurrentDynaHashCxt));
216 return MemoryContextAlloc(CurrentDynaHashCxt, size);
221 * HashCompareFunc for string keys
223 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
224 * bytes, so we can only compare that many ... hence strncmp is almost but
225 * not quite the right thing.
228 string_compare(const char *key1, const char *key2, Size keysize)
230 return strncmp(key1, key2, keysize - 1);
234 /************************** CREATE ROUTINES **********************/
237 * hash_create -- create a new dynamic hash table
239 * tabname: a name for the table (for debugging purposes)
240 * nelem: maximum number of elements expected
241 * *info: additional table parameters, as indicated by flags
242 * flags: bitmask indicating which parameters to take from *info
244 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
245 * estimate, since we can't expand the table on the fly. But an unshared
246 * hashtable can be expanded on-the-fly, so it's better for nelem to be
247 * on the small side and let the table grow if it's exceeded. An overly
248 * large nelem will penalize hash_seq_search speed without buying much.
251 hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
257 * For shared hash tables, we have a local hash header (HTAB struct) that
258 * we allocate in TopMemoryContext; all else is in shared memory.
260 * For non-shared hash tables, everything including the hash header is in
261 * a memory context created specially for the hash table --- this makes
262 * hash_destroy very simple. The memory context is made a child of either
263 * a context specified by the caller, or TopMemoryContext if nothing is
266 if (flags & HASH_SHARED_MEM)
268 /* Set up to allocate the hash header */
269 CurrentDynaHashCxt = TopMemoryContext;
273 /* Create the hash table's private memory context */
274 if (flags & HASH_CONTEXT)
275 CurrentDynaHashCxt = info->hcxt;
277 CurrentDynaHashCxt = TopMemoryContext;
278 CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
280 ALLOCSET_DEFAULT_MINSIZE,
281 ALLOCSET_DEFAULT_INITSIZE,
282 ALLOCSET_DEFAULT_MAXSIZE);
285 /* Initialize the hash header, plus a copy of the table name */
286 hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) +1);
287 MemSet(hashp, 0, sizeof(HTAB));
289 hashp->tabname = (char *) (hashp + 1);
290 strcpy(hashp->tabname, tabname);
292 if (flags & HASH_FUNCTION)
293 hashp->hash = info->hash;
295 hashp->hash = string_hash; /* default hash function */
298 * If you don't specify a match function, it defaults to string_compare if
299 * you used string_hash (either explicitly or by default) and to memcmp
300 * otherwise. (Prior to PostgreSQL 7.4, memcmp was always used.)
302 if (flags & HASH_COMPARE)
303 hashp->match = info->match;
304 else if (hashp->hash == string_hash)
305 hashp->match = (HashCompareFunc) string_compare;
307 hashp->match = memcmp;
310 * Similarly, the key-copying function defaults to strlcpy or memcpy.
312 if (flags & HASH_KEYCOPY)
313 hashp->keycopy = info->keycopy;
314 else if (hashp->hash == string_hash)
315 hashp->keycopy = (HashCopyFunc) strlcpy;
317 hashp->keycopy = memcpy;
319 if (flags & HASH_ALLOC)
320 hashp->alloc = info->alloc;
322 hashp->alloc = DynaHashAlloc;
324 if (flags & HASH_SHARED_MEM)
327 * ctl structure and directory are preallocated for shared memory
328 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
331 hashp->hctl = info->hctl;
332 hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
334 hashp->isshared = true;
336 /* hash table already exists, we're just attaching to it */
337 if (flags & HASH_ATTACH)
339 /* make local copies of some heavily-used values */
341 hashp->keysize = hctl->keysize;
342 hashp->ssize = hctl->ssize;
343 hashp->sshift = hctl->sshift;
350 /* setup hash table defaults */
353 hashp->hcxt = CurrentDynaHashCxt;
354 hashp->isshared = false;
359 hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
362 (errcode(ERRCODE_OUT_OF_MEMORY),
363 errmsg("out of memory")));
366 hashp->frozen = false;
372 if (flags & HASH_PARTITION)
374 /* Doesn't make sense to partition a local hash table */
375 Assert(flags & HASH_SHARED_MEM);
376 /* # of partitions had better be a power of 2 */
377 Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));
379 hctl->num_partitions = info->num_partitions;
382 if (flags & HASH_SEGMENT)
384 hctl->ssize = info->ssize;
385 hctl->sshift = my_log2(info->ssize);
386 /* ssize had better be a power of 2 */
387 Assert(hctl->ssize == (1L << hctl->sshift));
389 if (flags & HASH_FFACTOR)
390 hctl->ffactor = info->ffactor;
393 * SHM hash tables have fixed directory size passed by the caller.
395 if (flags & HASH_DIRSIZE)
397 hctl->max_dsize = info->max_dsize;
398 hctl->dsize = info->dsize;
402 * hash table now allocates space for key and data but you have to say how
403 * much space to allocate
405 if (flags & HASH_ELEM)
407 Assert(info->entrysize >= info->keysize);
408 hctl->keysize = info->keysize;
409 hctl->entrysize = info->entrysize;
412 /* make local copies of heavily-used constant fields */
413 hashp->keysize = hctl->keysize;
414 hashp->ssize = hctl->ssize;
415 hashp->sshift = hctl->sshift;
417 /* Build the hash directory structure */
418 if (!init_htab(hashp, nelem))
419 elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
422 * For a shared hash table, preallocate the requested number of elements.
423 * This reduces problems with run-time out-of-shared-memory conditions.
425 * For a non-shared hash table, preallocate the requested number of
426 * elements if it's less than our chosen nelem_alloc. This avoids wasting
427 * space if the caller correctly estimates a small table size.
429 if ((flags & HASH_SHARED_MEM) ||
430 nelem < hctl->nelem_alloc)
432 if (!element_alloc(hashp, (int) nelem))
434 (errcode(ERRCODE_OUT_OF_MEMORY),
435 errmsg("out of memory")));
442 * Set default HASHHDR parameters.
445 hdefault(HTAB *hashp)
447 HASHHDR *hctl = hashp->hctl;
449 MemSet(hctl, 0, sizeof(HASHHDR));
452 hctl->freeList = NULL;
454 hctl->dsize = DEF_DIRSIZE;
457 /* rather pointless defaults for key & entry size */
458 hctl->keysize = sizeof(char *);
459 hctl->entrysize = 2 * sizeof(char *);
461 hctl->num_partitions = 0; /* not partitioned */
463 hctl->ffactor = DEF_FFACTOR;
465 /* table has no fixed maximum size */
466 hctl->max_dsize = NO_MAX_DSIZE;
468 hctl->ssize = DEF_SEGSIZE;
469 hctl->sshift = DEF_SEGSIZE_SHIFT;
471 #ifdef HASH_STATISTICS
472 hctl->accesses = hctl->collisions = 0;
477 * Given the user-specified entry size, choose nelem_alloc, ie, how many
478 * elements to add to the hash table when we need more.
481 choose_nelem_alloc(Size entrysize)
487 /* Each element has a HASHELEMENT header plus user data. */
488 /* NB: this had better match element_alloc() */
489 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
492 * The idea here is to choose nelem_alloc at least 32, but round up so
493 * that the allocation request will be a power of 2 or just less. This
494 * makes little difference for hash tables in shared memory, but for hash
495 * tables managed by palloc, the allocation request will be rounded up to
496 * a power of 2 anyway. If we fail to take this into account, we'll waste
497 * as much as half the allocated space.
499 allocSize = 32 * 4; /* assume elementSize at least 8 */
503 nelem_alloc = allocSize / elementSize;
504 } while (nelem_alloc < 32);
510 * Compute derived fields of hctl and build the initial directory/segment
514 init_htab(HTAB *hashp, long nelem)
516 HASHHDR *hctl = hashp->hctl;
523 * initialize mutex if it's a partitioned table
525 if (IS_PARTITIONED(hctl))
526 SpinLockInit(&hctl->mutex);
529 * Divide number of elements by the fill factor to determine a desired
530 * number of buckets. Allocate space for the next greater power of two
533 lnbuckets = (nelem - 1) / hctl->ffactor + 1;
535 nbuckets = 1 << my_log2(lnbuckets);
538 * In a partitioned table, nbuckets must be at least equal to
539 * num_partitions; were it less, keys with apparently different partition
540 * numbers would map to the same bucket, breaking partition independence.
541 * (Normally nbuckets will be much bigger; this is just a safety check.)
543 while (nbuckets < hctl->num_partitions)
546 hctl->max_bucket = hctl->low_mask = nbuckets - 1;
547 hctl->high_mask = (nbuckets << 1) - 1;
550 * Figure number of directory segments needed, round up to a power of 2
552 nsegs = (nbuckets - 1) / hctl->ssize + 1;
553 nsegs = 1 << my_log2(nsegs);
556 * Make sure directory is big enough. If pre-allocated directory is too
557 * small, choke (caller screwed up).
559 if (nsegs > hctl->dsize)
567 /* Allocate a directory */
570 CurrentDynaHashCxt = hashp->hcxt;
571 hashp->dir = (HASHSEGMENT *)
572 hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
577 /* Allocate initial segments */
578 for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
580 *segp = seg_alloc(hashp);
585 /* Choose number of entries to allocate at a time */
586 hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
589 fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n%s%ld\n",
590 "TABLE POINTER ", hashp,
591 "DIRECTORY SIZE ", hctl->dsize,
592 "SEGMENT SIZE ", hctl->ssize,
593 "SEGMENT SHIFT ", hctl->sshift,
594 "FILL FACTOR ", hctl->ffactor,
595 "MAX BUCKET ", hctl->max_bucket,
596 "HIGH MASK ", hctl->high_mask,
597 "LOW MASK ", hctl->low_mask,
598 "NSEGS ", hctl->nsegs,
599 "NENTRIES ", hctl->nentries);
605 * Estimate the space needed for a hashtable containing the given number
606 * of entries of given size.
607 * NOTE: this is used to estimate the footprint of hashtables in shared
608 * memory; therefore it does not count HTAB which is in local memory.
609 * NB: assumes that all hash structure parameters have default values!
612 hash_estimate_size(long num_entries, Size entrysize)
622 /* estimate number of buckets wanted */
623 nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
624 /* # of segments needed for nBuckets */
625 nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
626 /* directory entries */
627 nDirEntries = DEF_DIRSIZE;
628 while (nDirEntries < nSegments)
629 nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
631 /* fixed control info */
632 size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
634 size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
636 size = add_size(size, mul_size(nSegments,
637 MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
638 /* elements --- allocated in groups of choose_nelem_alloc() entries */
639 elementAllocCnt = choose_nelem_alloc(entrysize);
640 nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
641 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
642 size = add_size(size,
643 mul_size(nElementAllocs,
644 mul_size(elementAllocCnt, elementSize)));
650 * Select an appropriate directory size for a hashtable with the given
651 * maximum number of entries.
652 * This is only needed for hashtables in shared memory, whose directories
653 * cannot be expanded dynamically.
654 * NB: assumes that all hash structure parameters have default values!
656 * XXX this had better agree with the behavior of init_htab()...
659 hash_select_dirsize(long num_entries)
665 /* estimate number of buckets wanted */
666 nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
667 /* # of segments needed for nBuckets */
668 nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
669 /* directory entries */
670 nDirEntries = DEF_DIRSIZE;
671 while (nDirEntries < nSegments)
672 nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
678 * Compute the required initial memory allocation for a shared-memory
679 * hashtable with the given parameters. We need space for the HASHHDR
680 * and for the (non expansible) directory.
683 hash_get_shared_size(HASHCTL *info, int flags)
685 Assert(flags & HASH_DIRSIZE);
686 Assert(info->dsize == info->max_dsize);
687 return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
691 /********************** DESTROY ROUTINES ************************/
694 hash_destroy(HTAB *hashp)
698 /* allocation method must be one we know how to free, too */
699 Assert(hashp->alloc == DynaHashAlloc);
700 /* so this hashtable must have it's own context */
701 Assert(hashp->hcxt != NULL);
703 hash_stats("destroy", hashp);
706 * Free everything by destroying the hash table's memory context.
708 MemoryContextDelete(hashp->hcxt);
713 hash_stats(const char *where, HTAB *hashp)
716 fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
717 where, hashp->hctl->accesses, hashp->hctl->collisions);
719 fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
720 hashp->hctl->nentries, (long) hashp->hctl->keysize,
721 hashp->hctl->max_bucket, hashp->hctl->nsegs);
722 fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
723 where, hash_accesses, hash_collisions);
724 fprintf(stderr, "hash_stats: total expansions %ld\n",
729 /*******************************SEARCH ROUTINES *****************************/
733 * get_hash_value -- exported routine to calculate a key's hash value
735 * We export this because for partitioned tables, callers need to compute
736 * the partition number (from the low-order bits of the hash value) before
740 get_hash_value(HTAB *hashp, const void *keyPtr)
742 return hashp->hash(keyPtr, hashp->keysize);
745 /* Convert a hash value to a bucket number */
747 calc_bucket(HASHHDR *hctl, uint32 hash_val)
751 bucket = hash_val & hctl->high_mask;
752 if (bucket > hctl->max_bucket)
753 bucket = bucket & hctl->low_mask;
759 * hash_search -- look up key in table and perform action
760 * hash_search_with_hash_value -- same, with key's hash value already computed
763 * HASH_FIND: look up key in table
764 * HASH_ENTER: look up key in table, creating entry if not present
765 * HASH_ENTER_NULL: same, but return NULL if out of memory
766 * HASH_REMOVE: look up key in table, remove entry if present
768 * Return value is a pointer to the element found/entered/removed if any,
769 * or NULL if no match was found. (NB: in the case of the REMOVE action,
770 * the result is a dangling pointer that shouldn't be dereferenced!)
772 * HASH_ENTER will normally ereport a generic "out of memory" error if
773 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
774 * the same except it will return NULL if out of memory. Note that
775 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
776 * since palloc internally ereports on out-of-memory.
778 * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
779 * existing entry in the table, FALSE otherwise. This is needed in the
780 * HASH_ENTER case, but is redundant with the return value otherwise.
782 * For hash_search_with_hash_value, the hashvalue parameter must have been
783 * calculated with get_hash_value().
786 hash_search(HTAB *hashp,
791 return hash_search_with_hash_value(hashp,
793 hashp->hash(keyPtr, hashp->keysize),
799 hash_search_with_hash_value(HTAB *hashp,
805 HASHHDR *hctl = hashp->hctl;
811 HASHBUCKET currBucket;
812 HASHBUCKET *prevBucketPtr;
813 HashCompareFunc match;
821 * Do the initial lookup
823 bucket = calc_bucket(hctl, hashvalue);
825 segment_num = bucket >> hashp->sshift;
826 segment_ndx = MOD(bucket, hashp->ssize);
828 segp = hashp->dir[segment_num];
831 hash_corrupted(hashp);
833 prevBucketPtr = &segp[segment_ndx];
834 currBucket = *prevBucketPtr;
837 * Follow collision chain looking for matching key
839 match = hashp->match; /* save one fetch in inner loop */
840 keysize = hashp->keysize; /* ditto */
842 while (currBucket != NULL)
844 if (currBucket->hashvalue == hashvalue &&
845 match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
847 prevBucketPtr = &(currBucket->link);
848 currBucket = *prevBucketPtr;
856 *foundPtr = (bool) (currBucket != NULL);
864 if (currBucket != NULL)
865 return (void *) ELEMENTKEY(currBucket);
869 if (currBucket != NULL)
871 /* use volatile pointer to prevent code rearrangement */
872 volatile HASHHDR *hctlv = hctl;
874 /* if partitioned, must lock to touch nentries and freeList */
875 if (IS_PARTITIONED(hctlv))
876 SpinLockAcquire(&hctlv->mutex);
878 Assert(hctlv->nentries > 0);
881 /* remove record from hash bucket's chain. */
882 *prevBucketPtr = currBucket->link;
884 /* add the record to the freelist for this table. */
885 currBucket->link = hctlv->freeList;
886 hctlv->freeList = currBucket;
888 if (IS_PARTITIONED(hctlv))
889 SpinLockRelease(&hctlv->mutex);
892 * better hope the caller is synchronizing access to this
893 * element, because someone else is going to reuse it the next
894 * time something is added to the table
896 return (void *) ELEMENTKEY(currBucket);
900 case HASH_ENTER_NULL:
901 /* ENTER_NULL does not work with palloc-based allocator */
902 Assert(hashp->alloc != DynaHashAlloc);
906 /* Return existing element if found, else create one */
907 if (currBucket != NULL)
908 return (void *) ELEMENTKEY(currBucket);
910 /* disallow inserts if frozen */
912 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
915 currBucket = get_hash_entry(hashp);
916 if (currBucket == NULL)
919 if (action == HASH_ENTER_NULL)
921 /* report a generic message */
924 (errcode(ERRCODE_OUT_OF_MEMORY),
925 errmsg("out of shared memory")));
928 (errcode(ERRCODE_OUT_OF_MEMORY),
929 errmsg("out of memory")));
932 /* link into hashbucket chain */
933 *prevBucketPtr = currBucket;
934 currBucket->link = NULL;
936 /* copy key into record */
937 currBucket->hashvalue = hashvalue;
938 hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
940 /* caller is expected to fill the data field on return */
943 * Check if it is time to split a bucket. Can't split if running
944 * in partitioned mode, nor if table is the subject of any active
945 * hash_seq_search scans. Strange order of these tests is to try
946 * to check cheaper conditions first.
948 if (!IS_PARTITIONED(hctl) &&
949 hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
950 !has_seq_scans(hashp))
953 * NOTE: failure to expand table is not a fatal error, it just
954 * means we have to run at higher fill factor than we wanted.
959 return (void *) ELEMENTKEY(currBucket);
962 elog(ERROR, "unrecognized hash action code: %d", (int) action);
964 return NULL; /* keep compiler quiet */
968 * create a new entry if possible
971 get_hash_entry(HTAB *hashp)
973 /* use volatile pointer to prevent code rearrangement */
974 volatile HASHHDR *hctlv = hashp->hctl;
975 HASHBUCKET newElement;
979 /* if partitioned, must lock to touch nentries and freeList */
980 if (IS_PARTITIONED(hctlv))
981 SpinLockAcquire(&hctlv->mutex);
983 /* try to get an entry from the freelist */
984 newElement = hctlv->freeList;
985 if (newElement != NULL)
988 /* no free elements. allocate another chunk of buckets */
989 if (IS_PARTITIONED(hctlv))
990 SpinLockRelease(&hctlv->mutex);
992 if (!element_alloc(hashp, hctlv->nelem_alloc))
999 /* remove entry from freelist, bump nentries */
1000 hctlv->freeList = newElement->link;
1003 if (IS_PARTITIONED(hctlv))
1004 SpinLockRelease(&hctlv->mutex);
1010 * hash_get_num_entries -- get the number of entries in a hashtable
1013 hash_get_num_entries(HTAB *hashp)
1016 * We currently don't bother with the mutex; it's only sensible to call
1017 * this function if you've got lock on all partitions of the table.
1019 return hashp->hctl->nentries;
1023 * hash_seq_init/_search/_term
1024 * Sequentially search through hash table and return
1025 * all the elements one by one, return NULL when no more.
1027 * hash_seq_term should be called if and only if the scan is abandoned before
1028 * completion; if hash_seq_search returns NULL then it has already done the
1029 * end-of-scan cleanup.
1031 * NOTE: caller may delete the returned element before continuing the scan.
1032 * However, deleting any other element while the scan is in progress is
1033 * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1034 * if elements are added to the table while the scan is in progress, it is
1035 * unspecified whether they will be visited by the scan or not.
1037 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1038 * worry about hash_seq_term cleanup, if the hashtable is first locked against
1039 * further insertions by calling hash_freeze. This is used by nodeAgg.c,
1040 * wherein it is inconvenient to track whether a scan is still open, and
1041 * there's no possibility of further insertions after readout has begun.
1043 * NOTE: to use this with a partitioned hashtable, caller had better hold
1044 * at least shared lock on all partitions of the table throughout the scan!
1045 * We can cope with insertions or deletions by our own backend, but *not*
1046 * with concurrent insertions or deletions by another.
1049 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1051 status->hashp = hashp;
1052 status->curBucket = 0;
1053 status->curEntry = NULL;
1055 register_seq_scan(hashp);
1059 hash_seq_search(HASH_SEQ_STATUS *status)
1069 HASHELEMENT *curElem;
1071 if ((curElem = status->curEntry) != NULL)
1073 /* Continuing scan of curBucket... */
1074 status->curEntry = curElem->link;
1075 if (status->curEntry == NULL) /* end of this bucket */
1076 ++status->curBucket;
1077 return (void *) ELEMENTKEY(curElem);
1081 * Search for next nonempty bucket starting at curBucket.
1083 curBucket = status->curBucket;
1084 hashp = status->hashp;
1086 ssize = hashp->ssize;
1087 max_bucket = hctl->max_bucket;
1089 if (curBucket > max_bucket)
1091 hash_seq_term(status);
1092 return NULL; /* search is done */
1096 * first find the right segment in the table directory.
1098 segment_num = curBucket >> hashp->sshift;
1099 segment_ndx = MOD(curBucket, ssize);
1101 segp = hashp->dir[segment_num];
1104 * Pick up the first item in this bucket's chain. If chain is not empty
1105 * we can begin searching it. Otherwise we have to advance to find the
1106 * next nonempty bucket. We try to optimize that case since searching a
1107 * near-empty hashtable has to iterate this loop a lot.
1109 while ((curElem = segp[segment_ndx]) == NULL)
1111 /* empty bucket, advance to next */
1112 if (++curBucket > max_bucket)
1114 status->curBucket = curBucket;
1115 hash_seq_term(status);
1116 return NULL; /* search is done */
1118 if (++segment_ndx >= ssize)
1122 segp = hashp->dir[segment_num];
1126 /* Begin scan of curBucket... */
1127 status->curEntry = curElem->link;
1128 if (status->curEntry == NULL) /* end of this bucket */
1130 status->curBucket = curBucket;
1131 return (void *) ELEMENTKEY(curElem);
1135 hash_seq_term(HASH_SEQ_STATUS *status)
1137 if (!status->hashp->frozen)
1138 deregister_seq_scan(status->hashp);
1143 * Freeze a hashtable against future insertions (deletions are
1146 * The reason for doing this is that by preventing any more bucket splits,
1147 * we no longer need to worry about registering hash_seq_search scans,
1148 * and thus caller need not be careful about ensuring hash_seq_term gets
1149 * called at the right times.
1151 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1152 * with active scans (since hash_seq_term would then do the wrong thing).
1155 hash_freeze(HTAB *hashp)
1157 if (hashp->isshared)
1158 elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1159 if (!hashp->frozen && has_seq_scans(hashp))
1160 elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1162 hashp->frozen = true;
1166 /********************************* UTILITIES ************************/
1169 * Expand the table by adding one more hash bucket.
1172 expand_table(HTAB *hashp)
1174 HASHHDR *hctl = hashp->hctl;
1175 HASHSEGMENT old_seg,
1183 HASHBUCKET *oldlink,
1185 HASHBUCKET currElement,
1188 Assert(!IS_PARTITIONED(hctl));
1190 #ifdef HASH_STATISTICS
1194 new_bucket = hctl->max_bucket + 1;
1195 new_segnum = new_bucket >> hashp->sshift;
1196 new_segndx = MOD(new_bucket, hashp->ssize);
1198 if (new_segnum >= hctl->nsegs)
1200 /* Allocate new segment if necessary -- could fail if dir full */
1201 if (new_segnum >= hctl->dsize)
1202 if (!dir_realloc(hashp))
1204 if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
1209 /* OK, we created a new bucket */
1213 * *Before* changing masks, find old bucket corresponding to same hash
1214 * values; values in that bucket may need to be relocated to new bucket.
1215 * Note that new_bucket is certainly larger than low_mask at this point,
1216 * so we can skip the first step of the regular hash mask calc.
1218 old_bucket = (new_bucket & hctl->low_mask);
1221 * If we crossed a power of 2, readjust masks.
1223 if ((uint32) new_bucket > hctl->high_mask)
1225 hctl->low_mask = hctl->high_mask;
1226 hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1230 * Relocate records to the new bucket. NOTE: because of the way the hash
1231 * masking is done in calc_bucket, only one old bucket can need to be
1232 * split at this point. With a different way of reducing the hash value,
1233 * that might not be true!
1235 old_segnum = old_bucket >> hashp->sshift;
1236 old_segndx = MOD(old_bucket, hashp->ssize);
1238 old_seg = hashp->dir[old_segnum];
1239 new_seg = hashp->dir[new_segnum];
1241 oldlink = &old_seg[old_segndx];
1242 newlink = &new_seg[new_segndx];
1244 for (currElement = *oldlink;
1245 currElement != NULL;
1246 currElement = nextElement)
1248 nextElement = currElement->link;
1249 if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1251 *oldlink = currElement;
1252 oldlink = &currElement->link;
1256 *newlink = currElement;
1257 newlink = &currElement->link;
1260 /* don't forget to terminate the rebuilt hash chains... */
1269 dir_realloc(HTAB *hashp)
1277 if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1280 /* Reallocate directory */
1281 new_dsize = hashp->hctl->dsize << 1;
1282 old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1283 new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1286 CurrentDynaHashCxt = hashp->hcxt;
1287 p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1291 memcpy(p, old_p, old_dirsize);
1292 MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1294 hashp->hctl->dsize = new_dsize;
1296 /* XXX assume the allocator is palloc, so we know how to free */
1297 Assert(hashp->alloc == DynaHashAlloc);
1308 seg_alloc(HTAB *hashp)
1312 CurrentDynaHashCxt = hashp->hcxt;
1313 segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1318 MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1324 * allocate some new elements and link them into the free list
1327 element_alloc(HTAB *hashp, int nelem)
1329 /* use volatile pointer to prevent code rearrangement */
1330 volatile HASHHDR *hctlv = hashp->hctl;
1332 HASHELEMENT *firstElement;
1333 HASHELEMENT *tmpElement;
1334 HASHELEMENT *prevElement;
1337 /* Each element has a HASHELEMENT header plus user data. */
1338 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);
1340 CurrentDynaHashCxt = hashp->hcxt;
1341 firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
1346 /* prepare to link all the new entries into the freelist */
1348 tmpElement = firstElement;
1349 for (i = 0; i < nelem; i++)
1351 tmpElement->link = prevElement;
1352 prevElement = tmpElement;
1353 tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1356 /* if partitioned, must lock to touch freeList */
1357 if (IS_PARTITIONED(hctlv))
1358 SpinLockAcquire(&hctlv->mutex);
1360 /* freelist could be nonempty if two backends did this concurrently */
1361 firstElement->link = hctlv->freeList;
1362 hctlv->freeList = prevElement;
1364 if (IS_PARTITIONED(hctlv))
1365 SpinLockRelease(&hctlv->mutex);
1370 /* complain when we have detected a corrupted hashtable */
1372 hash_corrupted(HTAB *hashp)
1375 * If the corruption is in a shared hashtable, we'd better force a
1376 * systemwide restart. Otherwise, just shut down this one backend.
1378 if (hashp->isshared)
1379 elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1381 elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1384 /* calculate ceil(log base 2) of num */
1391 for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
1397 /************************* SEQ SCAN TRACKING ************************/
1400 * We track active hash_seq_search scans here. The need for this mechanism
1401 * comes from the fact that a scan will get confused if a bucket split occurs
1402 * while it's in progress: it might visit entries twice, or even miss some
1403 * entirely (if it's partway through the same bucket that splits). Hence
1404 * we want to inhibit bucket splits if there are any active scans on the
1405 * table being inserted into. This is a fairly rare case in current usage,
1406 * so just postponing the split until the next insertion seems sufficient.
1408 * Given present usages of the function, only a few scans are likely to be
1409 * open concurrently; so a finite-size stack of open scans seems sufficient,
1410 * and we don't worry that linear search is too slow. Note that we do
1411 * allow multiple scans of the same hashtable to be open concurrently.
1413 * This mechanism can support concurrent scan and insertion in a shared
1414 * hashtable if it's the same backend doing both. It would fail otherwise,
1415 * but locking reasons seem to preclude any such scenario anyway, so we don't
1418 * This arrangement is reasonably robust if a transient hashtable is deleted
1419 * without notifying us. The absolute worst case is we might inhibit splits
1420 * in another table created later at exactly the same address. We will give
1421 * a warning at transaction end for reference leaks, so any bugs leading to
1422 * lack of notification should be easy to catch.
1425 #define MAX_SEQ_SCANS 100
1427 static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1428 static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
1429 static int num_seq_scans = 0;
1432 /* Register a table as having an active hash_seq_search scan */
1434 register_seq_scan(HTAB *hashp)
1436 if (num_seq_scans >= MAX_SEQ_SCANS)
1437 elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1439 seq_scan_tables[num_seq_scans] = hashp;
1440 seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1444 /* Deregister an active scan */
1446 deregister_seq_scan(HTAB *hashp)
1450 /* Search backward since it's most likely at the stack top */
1451 for (i = num_seq_scans - 1; i >= 0; i--)
1453 if (seq_scan_tables[i] == hashp)
1455 seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1456 seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1461 elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1465 /* Check if a table has any active scan */
1467 has_seq_scans(HTAB *hashp)
1471 for (i = 0; i < num_seq_scans; i++)
1473 if (seq_scan_tables[i] == hashp)
1479 /* Clean up any open scans at end of transaction */
1481 AtEOXact_HashTables(bool isCommit)
1484 * During abort cleanup, open scans are expected; just silently clean 'em
1485 * out. An open scan at commit means someone forgot a hash_seq_term()
1486 * call, so complain.
1488 * Note: it's tempting to try to print the tabname here, but refrain for
1489 * fear of touching deallocated memory. This isn't a user-facing message
1490 * anyway, so it needn't be pretty.
1496 for (i = 0; i < num_seq_scans; i++)
1498 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1499 seq_scan_tables[i]);
1505 /* Clean up any open scans at end of subtransaction */
1507 AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1512 * Search backward to make cleanup easy. Note we must check all entries,
1513 * not only those at the end of the array, because deletion technique
1514 * doesn't keep them in order.
1516 for (i = num_seq_scans - 1; i >= 0; i--)
1518 if (seq_scan_level[i] >= nestDepth)
1521 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1522 seq_scan_tables[i]);
1523 seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1524 seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];