From 51ee9fa1574e1826dde4012ecb07455d73fb1444 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 22 Jul 2006 23:04:39 +0000 Subject: [PATCH] Add support to dynahash.c for partitioning shared hashtables according to the low-order bits of the entry hash value. Also make some incidental cleanups in the dynahash API, such as not exporting the hash header structs to the world. --- src/backend/storage/ipc/shmem.c | 37 +-- src/backend/storage/lmgr/lock.c | 4 +- src/backend/utils/hash/dynahash.c | 428 +++++++++++++++++++++++++----- src/include/utils/hsearch.h | 98 ++----- 4 files changed, 387 insertions(+), 180 deletions(-) diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 3bce41d4ab..6a65429f0f 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.93 2006/07/14 14:52:22 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.94 2006/07/22 23:04:39 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -211,9 +211,6 @@ InitShmemIndex(void) { HASHCTL info; int hash_flags; - ShmemIndexEnt *result, - item; - bool found; /* * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex @@ -227,32 +224,11 @@ InitShmemIndex(void) info.entrysize = sizeof(ShmemIndexEnt); hash_flags = HASH_ELEM; - /* This will acquire the shmem index lock, but not release it. */ ShmemIndex = ShmemInitHash("ShmemIndex", SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE, &info, hash_flags); if (!ShmemIndex) elog(FATAL, "could not initialize Shmem Index"); - - /* - * Now, create an entry in the hashtable for the index itself. - */ - if (!IsUnderPostmaster) - { - MemSet(item.key, 0, SHMEM_INDEX_KEYSIZE); - strncpy(item.key, "ShmemIndex", SHMEM_INDEX_KEYSIZE); - - result = (ShmemIndexEnt *) - hash_search(ShmemIndex, (void *) &item, HASH_ENTER, &found); - - Assert(!found); - - result->location = MAKE_OFFSET(ShmemIndex->hctl); - result->size = SHMEM_INDEX_SIZE; - } - - /* now release the lock acquired in ShmemInitStruct */ - LWLockRelease(ShmemIndexLock); } /* @@ -295,7 +271,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ /* look it up in the shmem index */ location = ShmemInitStruct(name, - sizeof(HASHHDR) + infoP->dsize * sizeof(HASHSEGMENT), + hash_get_shared_size(infoP, hash_flags), &found); /* @@ -312,9 +288,8 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ if (found) hash_flags |= HASH_ATTACH; - /* Now provide the header and directory pointers */ + /* Pass location of hashtable header to hash_create */ infoP->hctl = (HASHHDR *) location; - infoP->dir = (HASHSEGMENT *) (((char *) location) + sizeof(HASHHDR)); return hash_create(name, init_size, infoP, hash_flags); } @@ -363,14 +338,16 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) * If the shmem index doesn't exist, we are bootstrapping: we must * be trying to init the shmem index itself. * - * Notice that the ShmemIndexLock is held until the shmem index - * has been completely initialized. + * Notice that the ShmemIndexLock is released before the shmem + * index has been initialized. This should be OK because no + * other process can be accessing shared memory yet. */ Assert(shmemseghdr->indexoffset == 0); structPtr = ShmemAlloc(size); shmemseghdr->indexoffset = MAKE_OFFSET(structPtr); *foundPtr = FALSE; } + LWLockRelease(ShmemIndexLock); return structPtr; } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 472b3d1f4b..cf78869a0b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.166 2006/07/14 14:52:23 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.167 2006/07/22 23:04:39 tgl Exp $ * * NOTES * A lock table is a shared memory hash table. When @@ -1958,7 +1958,7 @@ GetLockStatusData(void) { LWLockAcquire(FirstLockMgrLock + i, LW_SHARED); proclockTable = LockMethodProcLockHash[i]; - els += proclockTable->hctl->nentries; + els += hash_get_num_entries(proclockTable); } data->nelements = els; diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index f98b949423..23200667af 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -3,17 +3,36 @@ * dynahash.c * dynamic hash tables * + * dynahash.c supports both local-to-a-backend hash tables and hash tables in + * shared memory. For shared hash tables, it is the caller's responsibility + * to provide appropriate access interlocking. The simplest convention is + * that a single LWLock protects the whole hash table. Searches (HASH_FIND or + * hash_seq_search) need only shared lock, but any update requires exclusive + * lock. For heavily-used shared tables, the single-lock approach creates a + * concurrency bottleneck, so we also support "partitioned" locking wherein + * there are multiple LWLocks guarding distinct subsets of the table. To use + * a hash table in partitioned mode, the HASH_PARTITION flag must be given + * to hash_create. This prevents any attempt to split buckets on-the-fly. + * Therefore, each hash bucket chain operates independently, and no fields + * of the hash header change after init except nentries and freeList. + * A partitioned table uses a spinlock to guard changes of those two fields. + * This lets any subset of the hash buckets be treated as a separately + * lockable partition. We expect callers to use the low-order bits of a + * lookup key's hash value as a partition number --- this will work because + * of the way calc_bucket() maps hash values to bucket numbers. * * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.69 2006/07/14 14:52:25 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.70 2006/07/22 23:04:39 tgl Exp $ * *------------------------------------------------------------------------- */ + /* + * Original comments: * * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. * Coded into C, with minor code improvements, and with hsearch(3) interface, @@ -45,9 +64,107 @@ #include "postgres.h" #include "storage/shmem.h" +#include "storage/spin.h" #include "utils/dynahash.h" #include "utils/memutils.h" + +/* + * Constants + * + * A hash table has a top-level "directory", each of whose entries points + * to a "segment" of ssize bucket headers. The maximum number of hash + * buckets is thus dsize * ssize (but dsize may be expansible). Of course, + * the number of records in the table can be larger, but we don't want a + * whole lot of records per bucket or performance goes down. + * + * In a hash table allocated in shared memory, the directory cannot be + * expanded because it must stay at a fixed address. The directory size + * should be selected using hash_select_dirsize (and you'd better have + * a good idea of the maximum number of entries!). For non-shared hash + * tables, the initial directory size can be left at the default. + */ +#define DEF_SEGSIZE 256 +#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */ +#define DEF_DIRSIZE 256 +#define DEF_FFACTOR 1 /* default fill factor */ + + +/* A hash bucket is a linked list of HASHELEMENTs */ +typedef HASHELEMENT *HASHBUCKET; + +/* A hash segment is an array of bucket headers */ +typedef HASHBUCKET *HASHSEGMENT; + +/* + * Header structure for a hash table --- contains all changeable info + * + * In a shared-memory hash table, the HASHHDR is in shared memory, while + * each backend has a local HTAB struct. For a non-shared table, there isn't + * any functional difference between HASHHDR and HTAB, but we separate them + * anyway to share code between shared and non-shared tables. + */ +struct HASHHDR +{ + /* In a partitioned table, take this lock to touch nentries or freeList */ + slock_t mutex; /* unused if not partitioned table */ + + /* These fields change during entry addition/deletion */ + long nentries; /* number of entries in hash table */ + HASHELEMENT *freeList; /* linked list of free elements */ + + /* These fields can change, but not in a partitioned table */ + /* Also, dsize can't change in a shared table, even if unpartitioned */ + long dsize; /* directory size */ + long nsegs; /* number of allocated segments (<= dsize) */ + uint32 max_bucket; /* ID of maximum bucket in use */ + uint32 high_mask; /* mask to modulo into entire table */ + uint32 low_mask; /* mask to modulo into lower half of table */ + + /* These fields are fixed at hashtable creation */ + Size keysize; /* hash key length in bytes */ + Size entrysize; /* total user element size in bytes */ + long num_partitions; /* # partitions (must be power of 2), or 0 */ + long ffactor; /* target fill factor */ + long max_dsize; /* 'dsize' limit if directory is fixed size */ + long ssize; /* segment size --- must be power of 2 */ + int sshift; /* segment shift = log2(ssize) */ + int nelem_alloc; /* number of entries to allocate at once */ + +#ifdef HASH_STATISTICS + /* + * Count statistics here. NB: stats code doesn't bother with mutex, + * so counts could be corrupted a bit in a partitioned table. + */ + long accesses; + long collisions; +#endif +}; + +#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) + +/* + * Top control structure for a hashtable --- in a shared table, each backend + * has its own copy (OK since no fields change at runtime) + */ +struct HTAB +{ + HASHHDR *hctl; /* => shared control information */ + HASHSEGMENT *dir; /* directory of segment starts */ + HashValueFunc hash; /* hash function */ + HashCompareFunc match; /* key comparison function */ + HashCopyFunc keycopy; /* key copying function */ + HashAllocFunc alloc; /* memory allocator */ + MemoryContext hcxt; /* memory context if default allocator used */ + char *tabname; /* table name (for error messages) */ + bool isshared; /* true if table is in shared memory */ + + /* We keep local copies of these fixed values to reduce contention */ + Size keysize; /* hash key length in bytes */ + long ssize; /* segment size --- must be power of 2 */ + int sshift; /* segment shift = log2(ssize) */ +}; + /* * Key (also entry) part of a HASHELEMENT */ @@ -58,6 +175,12 @@ */ #define MOD(x,y) ((x) & ((y)-1)) +#if HASH_STATISTICS +static long hash_accesses, + hash_collisions, + hash_expansions; +#endif + /* * Private function prototypes */ @@ -66,6 +189,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp); static bool element_alloc(HTAB *hashp, int nelem); static bool dir_realloc(HTAB *hashp); static bool expand_table(HTAB *hashp); +static HASHBUCKET get_hash_entry(HTAB *hashp); static void hdefault(HTAB *hashp); static int choose_nelem_alloc(Size entrysize); static bool init_htab(HTAB *hashp, long nelem); @@ -85,13 +209,6 @@ DynaHashAlloc(Size size) } -#if HASH_STATISTICS -static long hash_accesses, - hash_collisions, - hash_expansions; -#endif - - /************************** CREATE ROUTINES **********************/ /* @@ -185,17 +302,26 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) if (flags & HASH_SHARED_MEM) { /* - * ctl structure is preallocated for shared memory tables. Note that - * HASH_DIRSIZE and HASH_ALLOC had better be set as well. + * ctl structure and directory are preallocated for shared memory + * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set + * as well. */ hashp->hctl = info->hctl; - hashp->dir = info->dir; + hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR)); hashp->hcxt = NULL; hashp->isshared = true; /* hash table already exists, we're just attaching to it */ if (flags & HASH_ATTACH) + { + /* make local copies of some heavily-used values */ + hctl = hashp->hctl; + hashp->keysize = hctl->keysize; + hashp->ssize = hctl->ssize; + hashp->sshift = hctl->sshift; + return hashp; + } } else { @@ -218,9 +344,16 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) hdefault(hashp); hctl = hashp->hctl; -#ifdef HASH_STATISTICS - hctl->accesses = hctl->collisions = 0; -#endif + + if (flags & HASH_PARTITION) + { + /* Doesn't make sense to partition a local hash table */ + Assert(flags & HASH_SHARED_MEM); + /* # of partitions had better be a power of 2 */ + Assert(info->num_partitions == (1L << my_log2(info->num_partitions))); + + hctl->num_partitions = info->num_partitions; + } if (flags & HASH_SEGMENT) { @@ -252,6 +385,11 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) hctl->entrysize = info->entrysize; } + /* make local copies of heavily-used constant fields */ + hashp->keysize = hctl->keysize; + hashp->ssize = hctl->ssize; + hashp->sshift = hctl->sshift; + /* Build the hash directory structure */ if (!init_htab(hashp, nelem)) { @@ -292,22 +430,29 @@ hdefault(HTAB *hashp) MemSet(hctl, 0, sizeof(HASHHDR)); - hctl->ssize = DEF_SEGSIZE; - hctl->sshift = DEF_SEGSIZE_SHIFT; - hctl->dsize = DEF_DIRSIZE; - hctl->ffactor = DEF_FFACTOR; hctl->nentries = 0; + hctl->freeList = NULL; + + hctl->dsize = DEF_DIRSIZE; hctl->nsegs = 0; /* rather pointless defaults for key & entry size */ hctl->keysize = sizeof(char *); hctl->entrysize = 2 * sizeof(char *); + hctl->num_partitions = 0; /* not partitioned */ + + hctl->ffactor = DEF_FFACTOR; + /* table has no fixed maximum size */ hctl->max_dsize = NO_MAX_DSIZE; - /* garbage collection for HASH_REMOVE */ - hctl->freeList = NULL; + hctl->ssize = DEF_SEGSIZE; + hctl->sshift = DEF_SEGSIZE_SHIFT; + +#ifdef HASH_STATISTICS + hctl->accesses = hctl->collisions = 0; +#endif } /* @@ -342,6 +487,10 @@ choose_nelem_alloc(Size entrysize) return nelem_alloc; } +/* + * Compute derived fields of hctl and build the initial directory/segment + * arrays + */ static bool init_htab(HTAB *hashp, long nelem) { @@ -351,6 +500,12 @@ init_htab(HTAB *hashp, long nelem) int nbuckets; int nsegs; + /* + * initialize mutex if it's a partitioned table + */ + if (IS_PARTITIONED(hctl)) + SpinLockInit(&hctl->mutex); + /* * Divide number of elements by the fill factor to determine a desired * number of buckets. Allocate space for the next greater power of two @@ -360,6 +515,15 @@ init_htab(HTAB *hashp, long nelem) nbuckets = 1 << my_log2(lnbuckets); + /* + * In a partitioned table, nbuckets must be at least equal to + * num_partitions; were it less, keys with apparently different partition + * numbers would map to the same bucket, breaking partition independence. + * (Normally nbuckets will be much bigger; this is just a safety check.) + */ + while (nbuckets < hctl->num_partitions) + nbuckets <<= 1; + hctl->max_bucket = hctl->low_mask = nbuckets - 1; hctl->high_mask = (nbuckets << 1) - 1; @@ -491,6 +655,19 @@ hash_select_dirsize(long num_entries) return nDirEntries; } +/* + * Compute the required initial memory allocation for a shared-memory + * hashtable with the given parameters. We need space for the HASHHDR + * and for the (non expansible) directory. + */ +Size +hash_get_shared_size(HASHCTL *info, int flags) +{ + Assert(flags & HASH_DIRSIZE); + Assert(info->dsize == info->max_dsize); + return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT); +} + /********************** DESTROY ROUTINES ************************/ @@ -521,7 +698,7 @@ hash_stats(const char *where, HTAB *hashp) where, hashp->hctl->accesses, hashp->hctl->collisions); fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n", - hashp->hctl->nentries, hashp->hctl->keysize, + hashp->hctl->nentries, (long) hashp->hctl->keysize, hashp->hctl->max_bucket, hashp->hctl->nsegs); fprintf(stderr, "%s: total accesses %ld total collisions %ld\n", where, hash_accesses, hash_collisions); @@ -533,6 +710,19 @@ hash_stats(const char *where, HTAB *hashp) /*******************************SEARCH ROUTINES *****************************/ +/* + * get_hash_value -- exported routine to calculate a key's hash value + * + * We export this because for partitioned tables, callers need to compute + * the partition number (from the low-order bits of the hash value) before + * searching. + */ +uint32 +get_hash_value(HTAB *hashp, const void *keyPtr) +{ + return hashp->hash(keyPtr, hashp->keysize); +} + /* Convert a hash value to a bucket number */ static inline uint32 calc_bucket(HASHHDR *hctl, uint32 hash_val) @@ -546,8 +736,9 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val) return bucket; } -/*---------- +/* * hash_search -- look up key in table and perform action + * hash_search_with_hash_value -- same, with key's hash value already computed * * action is one of: * HASH_FIND: look up key in table @@ -568,17 +759,32 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val) * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an * existing entry in the table, FALSE otherwise. This is needed in the * HASH_ENTER case, but is redundant with the return value otherwise. - *---------- + * + * For hash_search_with_hash_value, the hashvalue parameter must have been + * calculated with get_hash_value(). */ void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr) +{ + return hash_search_with_hash_value(hashp, + keyPtr, + hashp->hash(keyPtr, hashp->keysize), + action, + foundPtr); +} + +void * +hash_search_with_hash_value(HTAB *hashp, + const void *keyPtr, + uint32 hashvalue, + HASHACTION action, + bool *foundPtr) { HASHHDR *hctl = hashp->hctl; - Size keysize = hctl->keysize; - uint32 hashvalue; + Size keysize; uint32 bucket; long segment_num; long segment_ndx; @@ -595,11 +801,10 @@ hash_search(HTAB *hashp, /* * Do the initial lookup */ - hashvalue = hashp->hash(keyPtr, keysize); bucket = calc_bucket(hctl, hashvalue); - segment_num = bucket >> hctl->sshift; - segment_ndx = MOD(bucket, hctl->ssize); + segment_num = bucket >> hashp->sshift; + segment_ndx = MOD(bucket, hashp->ssize); segp = hashp->dir[segment_num]; @@ -613,6 +818,7 @@ hash_search(HTAB *hashp, * Follow collision chain looking for matching key */ match = hashp->match; /* save one fetch in inner loop */ + keysize = hashp->keysize; /* ditto */ while (currBucket != NULL) { @@ -643,15 +849,25 @@ hash_search(HTAB *hashp, case HASH_REMOVE: if (currBucket != NULL) { - Assert(hctl->nentries > 0); - hctl->nentries--; + /* use volatile pointer to prevent code rearrangement */ + volatile HASHHDR *hctlv = hctl; + + /* if partitioned, must lock to touch nentries and freeList */ + if (IS_PARTITIONED(hctlv)) + SpinLockAcquire(&hctlv->mutex); + + Assert(hctlv->nentries > 0); + hctlv->nentries--; /* remove record from hash bucket's chain. */ *prevBucketPtr = currBucket->link; /* add the record to the freelist for this table. */ - currBucket->link = hctl->freeList; - hctl->freeList = currBucket; + currBucket->link = hctlv->freeList; + hctlv->freeList = currBucket; + + if (IS_PARTITIONED(hctlv)) + SpinLockRelease(&hctlv->mutex); /* * better hope the caller is synchronizing access to this @@ -672,32 +888,23 @@ hash_search(HTAB *hashp, if (currBucket != NULL) return (void *) ELEMENTKEY(currBucket); - /* get the next free element */ - currBucket = hctl->freeList; + currBucket = get_hash_entry(hashp); if (currBucket == NULL) { - /* no free elements. allocate another chunk of buckets */ - if (!element_alloc(hashp, hctl->nelem_alloc)) - { - /* out of memory */ - if (action == HASH_ENTER_NULL) - return NULL; - /* report a generic message */ - if (hashp->isshared) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory"))); - else - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - currBucket = hctl->freeList; - Assert(currBucket != NULL); + /* out of memory */ + if (action == HASH_ENTER_NULL) + return NULL; + /* report a generic message */ + if (hashp->isshared) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + else + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); } - hctl->freeList = currBucket->link; - /* link into hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; @@ -708,8 +915,10 @@ hash_search(HTAB *hashp, /* caller is expected to fill the data field on return */ - /* Check if it is time to split the segment */ - if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor) + /* Check if it is time to split a bucket */ + /* Can't split if running in partitioned mode */ + if (!IS_PARTITIONED(hctl) && + hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor) { /* * NOTE: failure to expand table is not a fatal error, it just @@ -726,6 +935,61 @@ hash_search(HTAB *hashp, return NULL; /* keep compiler quiet */ } +/* + * create a new entry if possible + */ +static HASHBUCKET +get_hash_entry(HTAB *hashp) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile HASHHDR *hctlv = hashp->hctl; + HASHBUCKET newElement; + + for (;;) + { + /* if partitioned, must lock to touch nentries and freeList */ + if (IS_PARTITIONED(hctlv)) + SpinLockAcquire(&hctlv->mutex); + + /* try to get an entry from the freelist */ + newElement = hctlv->freeList; + if (newElement != NULL) + break; + + /* no free elements. allocate another chunk of buckets */ + if (IS_PARTITIONED(hctlv)) + SpinLockRelease(&hctlv->mutex); + + if (!element_alloc(hashp, hctlv->nelem_alloc)) + { + /* out of memory */ + return NULL; + } + } + + /* remove entry from freelist, bump nentries */ + hctlv->freeList = newElement->link; + hctlv->nentries++; + + if (IS_PARTITIONED(hctlv)) + SpinLockRelease(&hctlv->mutex); + + return newElement; +} + +/* + * hash_get_num_entries -- get the number of entries in a hashtable + */ +long +hash_get_num_entries(HTAB *hashp) +{ + /* + * We currently don't bother with the mutex; it's only sensible to call + * this function if you've got lock on all partitions of the table. + */ + return hashp->hctl->nentries; +} + /* * hash_seq_init/_search * Sequentially search through hash table and return @@ -736,6 +1000,9 @@ hash_search(HTAB *hashp, * UNDEFINED (it might be the one that curIndex is pointing at!). Also, * if elements are added to the table while the scan is in progress, it is * unspecified whether they will be visited by the scan or not. + * + * NOTE: to use this with a partitioned hashtable, caller had better hold + * at least shared lock on all partitions of the table throughout the scan! */ void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) @@ -773,7 +1040,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) curBucket = status->curBucket; hashp = status->hashp; hctl = hashp->hctl; - ssize = hctl->ssize; + ssize = hashp->ssize; max_bucket = hctl->max_bucket; if (curBucket > max_bucket) @@ -782,7 +1049,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) /* * first find the right segment in the table directory. */ - segment_num = curBucket >> hctl->sshift; + segment_num = curBucket >> hashp->sshift; segment_ndx = MOD(curBucket, ssize); segp = hashp->dir[segment_num]; @@ -840,13 +1107,15 @@ expand_table(HTAB *hashp) HASHBUCKET currElement, nextElement; + Assert(!IS_PARTITIONED(hctl)); + #ifdef HASH_STATISTICS hash_expansions++; #endif new_bucket = hctl->max_bucket + 1; - new_segnum = new_bucket >> hctl->sshift; - new_segndx = MOD(new_bucket, hctl->ssize); + new_segnum = new_bucket >> hashp->sshift; + new_segndx = MOD(new_bucket, hashp->ssize); if (new_segnum >= hctl->nsegs) { @@ -885,8 +1154,8 @@ expand_table(HTAB *hashp) * split at this point. With a different way of reducing the hash value, * that might not be true! */ - old_segnum = old_bucket >> hctl->sshift; - old_segndx = MOD(old_bucket, hctl->ssize); + old_segnum = old_bucket >> hashp->sshift; + old_segndx = MOD(old_bucket, hashp->ssize); old_seg = hashp->dir[old_segnum]; new_seg = hashp->dir[new_segnum]; @@ -963,12 +1232,12 @@ seg_alloc(HTAB *hashp) HASHSEGMENT segp; CurrentDynaHashCxt = hashp->hcxt; - segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->hctl->ssize); + segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize); if (!segp) return NULL; - MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->hctl->ssize); + MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize); return segp; } @@ -979,29 +1248,44 @@ seg_alloc(HTAB *hashp) static bool element_alloc(HTAB *hashp, int nelem) { - HASHHDR *hctl = hashp->hctl; + /* use volatile pointer to prevent code rearrangement */ + volatile HASHHDR *hctlv = hashp->hctl; Size elementSize; + HASHELEMENT *firstElement; HASHELEMENT *tmpElement; + HASHELEMENT *prevElement; int i; /* Each element has a HASHELEMENT header plus user data. */ - elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); + elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize); CurrentDynaHashCxt = hashp->hcxt; - tmpElement = (HASHELEMENT *) - hashp->alloc(nelem * elementSize); + firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); - if (!tmpElement) + if (!firstElement) return false; - /* link all the new entries into the freelist */ + /* prepare to link all the new entries into the freelist */ + prevElement = NULL; + tmpElement = firstElement; for (i = 0; i < nelem; i++) { - tmpElement->link = hctl->freeList; - hctl->freeList = tmpElement; + tmpElement->link = prevElement; + prevElement = tmpElement; tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize); } + /* if partitioned, must lock to touch freeList */ + if (IS_PARTITIONED(hctlv)) + SpinLockAcquire(&hctlv->mutex); + + /* freelist could be nonempty if two backends did this concurrently */ + firstElement->link = hctlv->freeList; + hctlv->freeList = prevElement; + + if (IS_PARTITIONED(hctlv)) + SpinLockRelease(&hctlv->mutex); + return true; } diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index a89be514d9..c682cf0329 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * hsearch.h - * for hash tables, particularly hash tables in shared memory + * exported definitions for utils/hash/dynahash.c; see notes therein * * * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.43 2006/06/25 18:29:49 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.44 2006/07/22 23:04:39 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,27 +43,6 @@ typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize); */ typedef void *(*HashAllocFunc) (Size request); -/* - * Constants - * - * A hash table has a top-level "directory", each of whose entries points - * to a "segment" of ssize bucket headers. The maximum number of hash - * buckets is thus dsize * ssize (but dsize may be expansible). Of course, - * the number of records in the table can be larger, but we don't want a - * whole lot of records per bucket or performance goes down. - * - * In a hash table allocated in shared memory, the directory cannot be - * expanded because it must stay at a fixed address. The directory size - * should be selected using hash_select_dirsize (and you'd better have - * a good idea of the maximum number of entries!). For non-shared hash - * tables, the initial directory size can be left at the default. - */ -#define DEF_SEGSIZE 256 -#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */ -#define DEF_DIRSIZE 256 -#define DEF_FFACTOR 1 /* default fill factor */ - - /* * HASHELEMENT is the private part of a hashtable entry. The caller's data * follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key @@ -75,81 +54,42 @@ typedef struct HASHELEMENT uint32 hashvalue; /* hash function result for this entry */ } HASHELEMENT; -/* A hash bucket is a linked list of HASHELEMENTs */ -typedef HASHELEMENT *HASHBUCKET; +/* Hash table header struct is an opaque type known only within dynahash.c */ +typedef struct HASHHDR HASHHDR; -/* A hash segment is an array of bucket headers */ -typedef HASHBUCKET *HASHSEGMENT; - -/* Header structure for a hash table --- contains all changeable info */ -typedef struct HASHHDR -{ - long dsize; /* Directory Size */ - long ssize; /* Segment Size --- must be power of 2 */ - int sshift; /* Segment shift = log2(ssize) */ - uint32 max_bucket; /* ID of Maximum bucket in use */ - uint32 high_mask; /* Mask to modulo into entire table */ - uint32 low_mask; /* Mask to modulo into lower half of table */ - long ffactor; /* Fill factor */ - long nentries; /* Number of entries in hash table */ - long nsegs; /* Number of allocated segments */ - Size keysize; /* hash key length in bytes */ - Size entrysize; /* total user element size in bytes */ - long max_dsize; /* 'dsize' limit if directory is fixed size */ - int nelem_alloc; /* number of entries to allocate at once */ - HASHELEMENT *freeList; /* linked list of free elements */ -#ifdef HASH_STATISTICS - long accesses; - long collisions; -#endif -} HASHHDR; - -/* - * Top control structure for a hashtable --- need not be shared, since - * no fields change at runtime - */ -typedef struct HTAB -{ - HASHHDR *hctl; /* shared control information */ - HASHSEGMENT *dir; /* directory of segment starts */ - HashValueFunc hash; /* hash function */ - HashCompareFunc match; /* key comparison function */ - HashCopyFunc keycopy; /* key copying function */ - HashAllocFunc alloc; /* memory allocator */ - MemoryContext hcxt; /* memory context if default allocator used */ - char *tabname; /* table name (for error messages) */ - bool isshared; /* true if table is in shared memory */ -} HTAB; +/* Hash table control struct is an opaque type known only within dynahash.c */ +typedef struct HTAB HTAB; /* Parameter data structure for hash_create */ /* Only those fields indicated by hash_flags need be set */ typedef struct HASHCTL { - long ssize; /* Segment Size */ - long dsize; /* (initial) Directory Size */ - long max_dsize; /* limit to dsize if directory size is limited */ - long ffactor; /* Fill factor */ + long num_partitions; /* # partitions (must be power of 2) */ + long ssize; /* segment size */ + long dsize; /* (initial) directory size */ + long max_dsize; /* limit to dsize if dir size is limited */ + long ffactor; /* fill factor */ Size keysize; /* hash key length in bytes */ Size entrysize; /* total user element size in bytes */ HashValueFunc hash; /* hash function */ HashCompareFunc match; /* key comparison function */ HashCopyFunc keycopy; /* key copying function */ HashAllocFunc alloc; /* memory allocator */ - HASHSEGMENT *dir; /* directory of segment starts */ - HASHHDR *hctl; /* location of header in shared mem */ MemoryContext hcxt; /* memory context to use for allocations */ + HASHHDR *hctl; /* location of header in shared mem */ } HASHCTL; /* Flags to indicate which parameters are supplied */ +#define HASH_PARTITION 0x001 /* Hashtable is used w/partitioned locking */ #define HASH_SEGMENT 0x002 /* Set segment size */ -#define HASH_DIRSIZE 0x004 /* Set directory size */ +#define HASH_DIRSIZE 0x004 /* Set directory size (initial and max) */ #define HASH_FFACTOR 0x008 /* Set fill factor */ #define HASH_FUNCTION 0x010 /* Set user defined hash function */ -#define HASH_ELEM 0x020 /* Set key/entry size */ +#define HASH_ELEM 0x020 /* Set keysize and entrysize */ #define HASH_SHARED_MEM 0x040 /* Hashtable is in shared memory */ #define HASH_ATTACH 0x080 /* Do not initialize hctl */ #define HASH_ALLOC 0x100 /* Set memory allocator */ -#define HASH_CONTEXT 0x200 /* Set explicit memory context */ +#define HASH_CONTEXT 0x200 /* Set memory allocation context */ #define HASH_COMPARE 0x400 /* Set user defined comparison function */ #define HASH_KEYCOPY 0x800 /* Set user defined key-copying function */ @@ -183,10 +123,16 @@ extern void hash_destroy(HTAB *hashp); extern void hash_stats(const char *where, HTAB *hashp); extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr); +extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr); +extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr, + uint32 hashvalue, HASHACTION action, + bool *foundPtr); +extern long hash_get_num_entries(HTAB *hashp); extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp); extern void *hash_seq_search(HASH_SEQ_STATUS *status); extern Size hash_estimate_size(long num_entries, Size entrysize); extern long hash_select_dirsize(long num_entries); +extern Size hash_get_shared_size(HASHCTL *info, int flags); /* * prototypes for functions in hashfn.c -- 2.40.0