From 8c0d7bafad36434cb08ac2c78e69ae72c194ca20 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Tue, 22 Aug 2017 22:41:32 -0700
Subject: [PATCH] Hash tables backed by DSA shared memory.

Add general purpose chaining hash tables for DSA memory.  Unlike
DynaHash in shared memory mode, these hash tables can grow as required,
and cope with being mapped into different addresses in different
backends.

There is a wide range of potential users for such a hash table, though
it's very likely the interface will need to evolve as we come to
understand the needs of different kinds of users.  E.g. support for
iterators and incremental resizing is planned for later commits and the
details of the callback signatures are likely to change.

Author: Thomas Munro
Reviewed-By: John Gorman, Andres Freund, Dilip Kumar, Robert Haas
Discussion: https://postgr.es/m/CAEepm=3d8o8XdVwYT6O=bHKsKAM2pu2D6sV1S_=4d+jStVCE7w@mail.gmail.com
            https://postgr.es/m/CAEepm=0ZtQ-SpsgCyzzYpsXS6e=kZWqk3g5Ygn3MDV7A8dabUA@mail.gmail.com
---
 src/backend/lib/Makefile         |   4 +-
 src/backend/lib/dshash.c         | 889 +++++++++++++++++++++++++++++++
 src/include/lib/dshash.h         | 107 ++++
 src/tools/pgindent/typedefs.list |   7 +
 4 files changed, 1005 insertions(+), 2 deletions(-)
 create mode 100644 src/backend/lib/dshash.c
 create mode 100644 src/include/lib/dshash.h

diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index f222c6c20d..d1fefe43f2 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/lib
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = binaryheap.o bipartite_match.o hyperloglog.o ilist.o knapsack.o \
-       pairingheap.o rbtree.o stringinfo.o
+OBJS = binaryheap.o bipartite_match.o dshash.o hyperloglog.o ilist.o \
+       knapsack.o pairingheap.o rbtree.o stringinfo.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
new file mode 100644
index 0000000000..988d569b84
--- /dev/null
+++ b/src/backend/lib/dshash.c
@@ -0,0 +1,889 @@
+/*-------------------------------------------------------------------------
+ *
+ * dshash.c
+ *    Concurrent hash tables backed by dynamic shared memory areas.
+ *
+ * This is an open hashing hash table, with a linked list at each table
+ * entry.  It supports dynamic resizing, as required to prevent the linked
+ * lists from growing too long on average.  Currently, only growing is
+ * supported: the hash table never becomes smaller.
+ *
+ * To deal with concurrency, it has a fixed size set of partitions, each of
+ * which is independently locked.  Each bucket maps to a partition; so insert,
+ * find and iterate operations normally only acquire one lock.  Therefore,
+ * good concurrency is achieved whenever such operations don't collide at the
+ * lock partition level.  However, when a resize operation begins, all
+ * partition locks must be acquired simultaneously for a brief period.  This
+ * is only expected to happen a small number of times until a stable size is
+ * found, since growth is geometric.
+ *
+ * Future versions may support iterators and incremental resizing; for now
+ * the implementation is minimalist.
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *    src/backend/lib/dshash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/dshash.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+#include "utils/memutils.h"
+
+/*
+ * An item in the hash table.  This wraps the user's entry object in an
+ * envelope that holds the key's hash value and a pointer to the next item in
+ * the same bucket.
+ */
+struct dshash_table_item
+{
+    /* The next item in the same bucket. */
+    dsa_pointer next;
+    /* The hashed key, to avoid having to recompute it. */
+    dshash_hash hash;
+    /* The user's entry object follows here.  See ENTRY_FROM_ITEM(item). */
+};
+
+/*
+ * The number of partitions for locking purposes.  This is set to match
+ * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
+ * the buffer pool must be good enough for any other purpose.  This could
+ * become a runtime parameter in future.
+ */
+#define DSHASH_NUM_PARTITIONS_LOG2 7
+#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
+
+/* A magic value used to identify our hash tables. */
+#define DSHASH_MAGIC 0x75ff6a20
+
+/*
+ * Tracking information for each lock partition.  Initially, each partition
+ * corresponds to one bucket, but each time the hash table grows, the buckets
+ * covered by each partition split so the number of buckets covered doubles.
+ *
+ * We might want to add padding here so that each partition is on a different
+ * cache line, but doing so would bloat this structure considerably.
+ */
+typedef struct dshash_partition
+{
+    LWLock      lock;           /* Protects all buckets in this partition. */
+    size_t      count;          /* # of items in this partition's buckets */
+} dshash_partition;
+
+/*
+ * The head object for a hash table.  This will be stored in dynamic shared
+ * memory.
+ */
+typedef struct dshash_table_control
+{
+    dshash_table_handle handle;
+    uint32      magic;
+    dshash_partition partitions[DSHASH_NUM_PARTITIONS];
+    int         lwlock_tranche_id;
+
+    /*
+     * The following members are written to only when ALL partition locks are
+     * held.  They can be read when any one partition lock is held.
+     */
+
+    /* Number of buckets expressed as power of 2 (8 = 256 buckets). */
+    size_t      size_log2;      /* log2(number of buckets) */
+    dsa_pointer buckets;        /* current bucket array */
+} dshash_table_control;
+
+/*
+ * Per-backend state for a dynamic hash table.
+ */
+struct dshash_table
+{
+    dsa_area   *area;           /* Backing dynamic shared memory area. */
+    dshash_parameters params;   /* Parameters. */
+    void       *arg;            /* User-supplied data pointer. */
+    dshash_table_control *control;  /* Control object in DSM. */
+    dsa_pointer *buckets;       /* Current bucket pointers in DSM. */
+    size_t      size_log2;      /* log2(number of buckets) */
+    bool        find_locked;    /* Is any partition lock held by 'find'? */
+    bool        find_exclusively_locked;    /* ... exclusively? */
+};
+
+/* Given a pointer to an item, find the entry (user data) it holds. */
+#define ENTRY_FROM_ITEM(item) \
+    ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
+
+/* Given a pointer to an entry, find the item that holds it. */
+#define ITEM_FROM_ENTRY(entry) \
+    ((dshash_table_item *)((char *)(entry) - \
+                           MAXALIGN(sizeof(dshash_table_item))))
+
+/* How many resize operations (bucket splits) have there been? */
+#define NUM_SPLITS(size_log2) \
+    (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
+
+/* How many buckets are there in each partition at a given size? */
+#define BUCKETS_PER_PARTITION(size_log2) \
+    (UINT64CONST(1) << NUM_SPLITS(size_log2))
+
+/* Max entries before we need to grow.  Half + quarter = 75% load factor. */
+#define MAX_COUNT_PER_PARTITION(hash_table) \
+    (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
+     BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
+
+/* Choose partition based on the highest order bits of the hash. */
+#define PARTITION_FOR_HASH(hash) \
+    (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
+
+/*
+ * Find the bucket index for a given hash and table size.  Each time the table
+ * doubles in size, the appropriate bucket for a given hash value doubles and
+ * possibly adds one, depending on the newly revealed bit, so that all buckets
+ * are split.
+ */
+#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \
+    (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
+
+/* The index of the first bucket in a given partition. */
+#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
+    ((partition) << NUM_SPLITS(size_log2))
+
+/* The head of the active bucket for a given hash value (lvalue). */
+#define BUCKET_FOR_HASH(hash_table, hash) \
+    (hash_table->buckets[ \
+        BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \
+                                       hash_table->size_log2)])
+
+static void delete_item(dshash_table *hash_table,
+            dshash_table_item *item);
+static void resize(dshash_table *hash_table, size_t new_size_log2);
+static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
+static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
+               const void *key,
+               dsa_pointer item_pointer);
+static void insert_item_into_bucket(dshash_table *hash_table,
+                        dsa_pointer item_pointer,
+                        dshash_table_item *item,
+                        dsa_pointer *bucket);
+static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
+                   const void *key,
+                   dsa_pointer *bucket);
+static bool delete_key_from_bucket(dshash_table *hash_table,
+                       const void *key,
+                       dsa_pointer *bucket_head);
+static bool delete_item_from_bucket(dshash_table *hash_table,
+                        dshash_table_item *item,
+                        dsa_pointer *bucket_head);
+static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
+static inline bool equal_keys(dshash_table *hash_table,
+           const void *a, const void *b);
+
+#define PARTITION_LOCK(hash_table, i) \
+    (&(hash_table)->control->partitions[(i)].lock)
+
+/*
+ * Create a new hash table backed by the given dynamic shared area, with the
+ * given parameters.  The returned object is allocated in backend-local memory
+ * using the current MemoryContext.  If 'arg' is non-null, the arg variants of
+ * hash and compare functions must be provided in 'params' and 'arg' will be
+ * passed down to them.
+ */
+dshash_table *
+dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
+{
+    dshash_table *hash_table;
+    dsa_pointer control;
+
+    /* Sanity checks on the set of supplied functions. */
+    Assert((params->compare_function != NULL) ^
+           (params->compare_arg_function != NULL));
+    Assert((params->hash_function != NULL) ^
+           (params->hash_arg_function != NULL));
+    Assert(arg == NULL || (params->compare_arg_function != NULL));
+    Assert(arg == NULL || (params->hash_arg_function != NULL));
+
+    /* Allocate the backend-local object representing the hash table. */
+    hash_table = palloc(sizeof(dshash_table));
+
+    /* Allocate the control object in shared memory. */
+    control = dsa_allocate(area, sizeof(dshash_table_control));
+
+    /* Set up the local and shared hash table structs. */
+    hash_table->area = area;
+    hash_table->params = *params;
+    hash_table->arg = arg;
+    hash_table->control = dsa_get_address(area, control);
+    hash_table->control->handle = control;
+    hash_table->control->magic = DSHASH_MAGIC;
+    hash_table->control->lwlock_tranche_id = params->tranche_id;
+
+    /* Set up the array of lock partitions. */
+    {
+        dshash_partition *partitions = hash_table->control->partitions;
+        int         tranche_id = hash_table->control->lwlock_tranche_id;
+        int         i;
+
+        for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+        {
+            LWLockInitialize(&partitions[i].lock, tranche_id);
+            partitions[i].count = 0;
+        }
+    }
+
+    hash_table->find_locked = false;
+    hash_table->find_exclusively_locked = false;
+
+    /*
+     * Set up the initial array of buckets.  Our initial size is the same as
+     * the number of partitions.
+     */
+    hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
+    hash_table->control->buckets =
+        dsa_allocate(area, sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS);
+    hash_table->buckets = dsa_get_address(area,
+                                          hash_table->control->buckets);
+    memset(hash_table->buckets, 0, sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS);
+
+    return hash_table;
+}
+
+/*
+ * Attach to an existing hash table using a handle.  The returned object is
+ * allocated in backend-local memory using the current MemoryContext.  If
+ * 'arg' is non-null, the arg variants of hash and compare functions must be
+ * provided in 'params' and 'arg' will be passed down to them.
+ */
+dshash_table *
+dshash_attach(dsa_area *area, const dshash_parameters *params,
+              dshash_table_handle handle, void *arg)
+{
+    dshash_table *hash_table;
+    dsa_pointer control;
+
+    /* Sanity checks on the set of supplied functions. */
+    Assert((params->compare_function != NULL) ^
+           (params->compare_arg_function != NULL));
+    Assert((params->hash_function != NULL) ^
+           (params->hash_arg_function != NULL));
+    Assert(arg == NULL || (params->compare_arg_function != NULL));
+    Assert(arg == NULL || (params->hash_arg_function != NULL));
+
+    /* Allocate the backend-local object representing the hash table. */
+    hash_table = palloc(sizeof(dshash_table));
+
+    /* Find the control object in shared memory. */
+    control = handle;
+
+    /* Set up the local hash table struct. */
+    hash_table->area = area;
+    hash_table->params = *params;
+    hash_table->arg = arg;
+    hash_table->control = dsa_get_address(area, control);
+    hash_table->find_locked = false;
+    hash_table->find_exclusively_locked = false;
+    Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+    return hash_table;
+}
+
+/*
+ * Detach from a hash table.  This frees backend-local resources associated
+ * with the hash table, but the hash table will continue to exist until it is
+ * either explicitly destroyed (by a backend that is still attached to it), or
+ * the area that backs it is returned to the operating system.
+ */
+void
+dshash_detach(dshash_table *hash_table)
+{
+    Assert(!hash_table->find_locked);
+
+    /* The hash table may have been destroyed.  Just free local memory. */
+    pfree(hash_table);
+}
+
+/*
+ * Destroy a hash table, returning all memory to the area.  The caller must be
+ * certain that no other backend will attempt to access the hash table before
+ * calling this function.  Other backends must explicitly call dshash_detach
+ * to free up backend-local memory associated with the hash table.  The
+ * backend that calls dshash_destroy must not call dshash_detach.
+ */ +void +dshash_destroy(dshash_table *hash_table) +{ + size_t size; + size_t i; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + ensure_valid_bucket_pointers(hash_table); + + /* Free all the entries. */ + size = 1 << hash_table->size_log2; + for (i = 0; i < size; ++i) + { + dsa_pointer item_pointer = hash_table->buckets[i]; + + while (DsaPointerIsValid(item_pointer)) + { + dshash_table_item *item; + dsa_pointer next_item_pointer; + + item = dsa_get_address(hash_table->area, item_pointer); + next_item_pointer = item->next; + dsa_free(hash_table->area, item_pointer); + item_pointer = next_item_pointer; + } + } + + /* + * Vandalize the control block to help catch programming errors where + * other backends access the memory formerly occupied by this hash table. + */ + hash_table->control->magic = 0; + + /* Free the active table and control object. */ + dsa_free(hash_table->area, hash_table->control->buckets); + dsa_free(hash_table->area, hash_table->control->handle); + + pfree(hash_table); +} + +/* + * Get a handle that can be used by other processes to attach to this hash + * table. + */ +dshash_table_handle +dshash_get_hash_table_handle(dshash_table *hash_table) +{ + Assert(hash_table->control->magic == DSHASH_MAGIC); + + return hash_table->control->handle; +} + +/* + * Look up an entry, given a key. Returns a pointer to an entry if one can be + * found with the given key. Returns NULL if the key is not found. If a + * non-NULL value is returned, the entry is locked and must be released by + * calling dshash_release_lock. If an error is raised before + * dshash_release_lock is called, the lock will be released automatically, but + * the caller must take care to ensure that the entry is not left corrupted. + * The lock mode is either shared or exclusive depending on 'exclusive'. + * + * The caller must not lock a lock already. + * + * Note that the lock held is in fact an LWLock, so interrupts will be held on + * return from this function, and not resumed until dshash_release_lock is + * called. It is a very good idea for the caller to release the lock quickly. + */ +void * +dshash_find(dshash_table *hash_table, const void *key, bool exclusive) +{ + dshash_hash hash; + size_t partition; + dshash_table_item *item; + + hash = hash_key(hash_table, key); + partition = PARTITION_FOR_HASH(hash); + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(!hash_table->find_locked); + + LWLockAcquire(PARTITION_LOCK(hash_table, partition), + exclusive ? LW_EXCLUSIVE : LW_SHARED); + ensure_valid_bucket_pointers(hash_table); + + /* Search the active bucket. */ + item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash)); + + if (!item) + { + /* Not found. */ + LWLockRelease(PARTITION_LOCK(hash_table, partition)); + return NULL; + } + else + { + /* The caller will free the lock by calling dshash_release. */ + hash_table->find_locked = true; + hash_table->find_exclusively_locked = exclusive; + return ENTRY_FROM_ITEM(item); + } +} + +/* + * Returns a pointer to an exclusively locked item which must be released with + * dshash_release_lock. If the key is found in the hash table, 'found' is set + * to true and a pointer to the existing entry is returned. If the key is not + * found, 'found' is set to false, and a pointer to a newly created entry is + * returned. + * + * Notes above dshash_find() regarding locking and error handling equally + * apply here. 
+ */ +void * +dshash_find_or_insert(dshash_table *hash_table, + const void *key, + bool *found) +{ + dshash_hash hash; + size_t partition_index; + dshash_partition *partition; + dshash_table_item *item; + + hash = hash_key(hash_table, key); + partition_index = PARTITION_FOR_HASH(hash); + partition = &hash_table->control->partitions[partition_index]; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(!hash_table->find_locked); + +restart: + LWLockAcquire(PARTITION_LOCK(hash_table, partition_index), + LW_EXCLUSIVE); + ensure_valid_bucket_pointers(hash_table); + + /* Search the active bucket. */ + item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash)); + + if (item) + *found = true; + else + { + *found = false; + + /* Check if we are getting too full. */ + if (partition->count > MAX_COUNT_PER_PARTITION(hash_table)) + { + /* + * The load factor (= keys / buckets) for all buckets protected by + * this partition is > 0.75. Presumably the same applies + * generally across the whole hash table (though we don't attempt + * to track that directly to avoid contention on some kind of + * central counter; we just assume that this partition is + * representative). This is a good time to resize. + * + * Give up our existing lock first, because resizing needs to + * reacquire all the locks in the right order to avoid deadlocks. + */ + LWLockRelease(PARTITION_LOCK(hash_table, partition_index)); + resize(hash_table, hash_table->size_log2 + 1); + + goto restart; + } + + /* Finally we can try to insert the new item. */ + item = insert_into_bucket(hash_table, key, + &BUCKET_FOR_HASH(hash_table, hash)); + item->hash = hash; + /* Adjust per-lock-partition counter for load factor knowledge. */ + ++partition->count; + } + + /* The caller must release the lock with dshash_release_lock. */ + hash_table->find_locked = true; + hash_table->find_exclusively_locked = true; + return ENTRY_FROM_ITEM(item); +} + +/* + * Remove an entry by key. Returns true if the key was found and the + * corresponding entry was removed. + * + * To delete an entry that you already have a pointer to, see + * dshash_delete_entry. + */ +bool +dshash_delete_key(dshash_table *hash_table, const void *key) +{ + dshash_hash hash; + size_t partition; + bool found; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(!hash_table->find_locked); + + hash = hash_key(hash_table, key); + partition = PARTITION_FOR_HASH(hash); + + LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE); + ensure_valid_bucket_pointers(hash_table); + + if (delete_key_from_bucket(hash_table, key, + &BUCKET_FOR_HASH(hash_table, hash))) + { + Assert(hash_table->control->partitions[partition].count > 0); + found = true; + --hash_table->control->partitions[partition].count; + } + else + found = false; + + LWLockRelease(PARTITION_LOCK(hash_table, partition)); + + return found; +} + +/* + * Remove an entry. The entry must already be exclusively locked, and must + * have been obtained by dshash_find or dshash_find_or_insert. Note that this + * function releases the lock just like dshash_release_lock. + * + * To delete an entry by key, see dshash_delete_key. 
+ */ +void +dshash_delete_entry(dshash_table *hash_table, void *entry) +{ + dshash_table_item *item = ITEM_FROM_ENTRY(entry); + size_t partition = PARTITION_FOR_HASH(item->hash); + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(hash_table->find_locked); + Assert(hash_table->find_exclusively_locked); + Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition), + LW_EXCLUSIVE)); + + delete_item(hash_table, item); + hash_table->find_locked = false; + hash_table->find_exclusively_locked = false; + LWLockRelease(PARTITION_LOCK(hash_table, partition)); +} + +/* + * Unlock an entry which was locked by dshash_find or dshash_find_or_insert. + */ +void +dshash_release_lock(dshash_table *hash_table, void *entry) +{ + dshash_table_item *item = ITEM_FROM_ENTRY(entry); + size_t partition_index = PARTITION_FOR_HASH(item->hash); + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(hash_table->find_locked); + Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition_index), + hash_table->find_exclusively_locked + ? LW_EXCLUSIVE : LW_SHARED)); + + hash_table->find_locked = false; + hash_table->find_exclusively_locked = false; + LWLockRelease(PARTITION_LOCK(hash_table, partition_index)); +} + +/* + * Print debugging information about the internal state of the hash table to + * stderr. The caller must hold no partition locks. + */ +void +dshash_dump(dshash_table *hash_table) +{ + size_t i; + size_t j; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(!hash_table->find_locked); + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i))); + LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED); + } + + ensure_valid_bucket_pointers(hash_table); + + fprintf(stderr, + "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2); + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + dshash_partition *partition = &hash_table->control->partitions[i]; + size_t begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2); + size_t end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2); + + fprintf(stderr, " partition %zu\n", i); + fprintf(stderr, + " active buckets (key count = %zu)\n", partition->count); + + for (j = begin; j < end; ++j) + { + size_t count = 0; + dsa_pointer bucket = hash_table->buckets[j]; + + while (DsaPointerIsValid(bucket)) + { + dshash_table_item *item; + + item = dsa_get_address(hash_table->area, bucket); + + bucket = item->next; + ++count; + } + fprintf(stderr, " bucket %zu (key count = %zu)\n", j, count); + } + } + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + LWLockRelease(PARTITION_LOCK(hash_table, i)); +} + +/* + * Delete a locked item to which we have a pointer. + */ +static void +delete_item(dshash_table *hash_table, dshash_table_item *item) +{ + size_t hash = item->hash; + size_t partition = PARTITION_FOR_HASH(hash); + + Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition))); + + if (delete_item_from_bucket(hash_table, item, + &BUCKET_FOR_HASH(hash_table, hash))) + { + Assert(hash_table->control->partitions[partition].count > 0); + --hash_table->control->partitions[partition].count; + } + else + { + Assert(false); + } +} + +/* + * Grow the hash table if necessary to the requested number of buckets. The + * requested size must be double some previously observed size. Returns true + * if the table was successfully expanded or found to be big enough already + * (because another backend expanded it). + * + * Must be called without any partition lock held. 
+ */ +static void +resize(dshash_table *hash_table, size_t new_size_log2) +{ + dsa_pointer old_buckets; + dsa_pointer new_buckets_shared; + dsa_pointer *new_buckets; + size_t size; + size_t new_size = 1 << new_size_log2; + size_t i; + + /* + * Acquire the locks for all lock partitions. This is expensive, but we + * shouldn't have to do it many times. + */ + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i))); + + LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE); + if (i == 0 && hash_table->control->size_log2 >= new_size_log2) + { + /* + * Another backend has already increased the size; we can avoid + * obtaining all the locks and return early. + */ + LWLockRelease(PARTITION_LOCK(hash_table, 0)); + return; + } + } + + Assert(new_size_log2 == hash_table->control->size_log2 + 1); + + /* Allocate the space for the new table. */ + new_buckets_shared = dsa_allocate0(hash_table->area, + sizeof(dsa_pointer) * new_size); + new_buckets = dsa_get_address(hash_table->area, new_buckets_shared); + + /* + * We've allocate the new bucket array; all that remains to do now is to + * reinsert all items, which amounts to adjusting all the pointers. + */ + size = 1 << hash_table->control->size_log2; + for (i = 0; i < size; ++i) + { + dsa_pointer item_pointer = hash_table->buckets[i]; + + while (DsaPointerIsValid(item_pointer)) + { + dshash_table_item *item; + dsa_pointer next_item_pointer; + + item = dsa_get_address(hash_table->area, item_pointer); + next_item_pointer = item->next; + insert_item_into_bucket(hash_table, item_pointer, item, + &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash, + new_size_log2)]); + item_pointer = next_item_pointer; + } + } + + /* Swap the hash table into place and free the old one. */ + old_buckets = hash_table->control->buckets; + hash_table->control->buckets = new_buckets_shared; + hash_table->control->size_log2 = new_size_log2; + hash_table->buckets = new_buckets; + dsa_free(hash_table->area, old_buckets); + + /* Release all the locks. */ + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + LWLockRelease(PARTITION_LOCK(hash_table, i)); +} + +/* + * Make sure that our backend-local bucket pointers are up to date. The + * caller must have locked one lock partition, which prevents resize() from + * running concurrently. + */ +static inline void +ensure_valid_bucket_pointers(dshash_table *hash_table) +{ + if (hash_table->size_log2 != hash_table->control->size_log2) + { + hash_table->buckets = dsa_get_address(hash_table->area, + hash_table->control->buckets); + hash_table->size_log2 = hash_table->control->size_log2; + } +} + +/* + * Scan a locked bucket for a match, using the provided compare function. + */ +static inline dshash_table_item * +find_in_bucket(dshash_table *hash_table, const void *key, + dsa_pointer item_pointer) +{ + while (DsaPointerIsValid(item_pointer)) + { + dshash_table_item *item; + + item = dsa_get_address(hash_table->area, item_pointer); + if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item))) + return item; + item_pointer = item->next; + } + return NULL; +} + +/* + * Insert an already-allocated item into a bucket. + */ +static void +insert_item_into_bucket(dshash_table *hash_table, + dsa_pointer item_pointer, + dshash_table_item *item, + dsa_pointer *bucket) +{ + Assert(item == dsa_get_address(hash_table->area, item_pointer)); + + item->next = *bucket; + *bucket = item_pointer; +} + +/* + * Allocate space for an entry with the given key and insert it into the + * provided bucket. 
+ */ +static dshash_table_item * +insert_into_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer *bucket) +{ + dsa_pointer item_pointer; + dshash_table_item *item; + + item_pointer = dsa_allocate(hash_table->area, + hash_table->params.entry_size + + MAXALIGN(sizeof(dshash_table_item))); + item = dsa_get_address(hash_table->area, item_pointer); + memcpy(ENTRY_FROM_ITEM(item), key, hash_table->params.key_size); + insert_item_into_bucket(hash_table, item_pointer, item, bucket); + return item; +} + +/* + * Search a bucket for a matching key and delete it. + */ +static bool +delete_key_from_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer *bucket_head) +{ + while (DsaPointerIsValid(*bucket_head)) + { + dshash_table_item *item; + + item = dsa_get_address(hash_table->area, *bucket_head); + + if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item))) + { + dsa_pointer next; + + next = item->next; + dsa_free(hash_table->area, *bucket_head); + *bucket_head = next; + + return true; + } + bucket_head = &item->next; + } + return false; +} + +/* + * Delete the specified item from the bucket. + */ +static bool +delete_item_from_bucket(dshash_table *hash_table, + dshash_table_item *item, + dsa_pointer *bucket_head) +{ + while (DsaPointerIsValid(*bucket_head)) + { + dshash_table_item *bucket_item; + + bucket_item = dsa_get_address(hash_table->area, *bucket_head); + + if (bucket_item == item) + { + dsa_pointer next; + + next = item->next; + dsa_free(hash_table->area, *bucket_head); + *bucket_head = next; + return true; + } + bucket_head = &bucket_item->next; + } + return false; +} + +/* + * Compute the hash value for a key. + */ +static inline dshash_hash +hash_key(dshash_table *hash_table, const void *key) +{ + if (hash_table->params.hash_arg_function != NULL) + return hash_table->params.hash_arg_function(key, hash_table->arg); + else + return hash_table->params.hash_function(key, + hash_table->params.key_size); +} + +/* + * Check whether two keys compare equal. + */ +static inline bool +equal_keys(dshash_table *hash_table, const void *a, const void *b) +{ + int r; + + if (hash_table->params.compare_arg_function != NULL) + r = hash_table->params.compare_arg_function(a, b, hash_table->arg); + else + r = hash_table->params.compare_function(a, b, + hash_table->params.key_size); + + return r == 0; +} diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h new file mode 100644 index 0000000000..187f58b4e4 --- /dev/null +++ b/src/include/lib/dshash.h @@ -0,0 +1,107 @@ +/*------------------------------------------------------------------------- + * + * dshash.h + * Concurrent hash tables backed by dynamic shared memory areas. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/lib/dshash.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSHASH_H +#define DSHASH_H + +#include "utils/dsa.h" + +/* The opaque type representing a hash table. */ +struct dshash_table; +typedef struct dshash_table dshash_table; + +/* A handle for a dshash_table which can be shared with other processes. */ +typedef dsa_pointer dshash_table_handle; + +/* The type for hash values. */ +typedef uint32 dshash_hash; + +/* + * A function used for comparing keys. This version is compatible with + * HashCompareFunction in hsearch.h and memcmp. 
+ */ +typedef int (*dshash_compare_function) (const void *a, const void *b, size_t size); + +/* + * A function type used for comparing keys. This version allows compare + * functions to receive a pointer to arbitrary user data that was given to the + * create or attach function. Similar to qsort_arg_comparator. + */ +typedef int (*dshash_compare_arg_function) (const void *a, const void *b, void *arg); + +/* + * A function type for computing hash values for keys. This version is + * compatible with HashValueFunc in hsearch.h and hash functions like + * tag_hash. + */ +typedef dshash_hash (*dshash_hash_function) (const void *v, size_t size); + +/* + * A function type for computing hash values for keys. This version allows + * hash functions to receive a pointer to arbitrary user data that was given + * to the create or attach function. + */ +typedef dshash_hash (*dshash_hash_arg_function) (const void *v, void *arg); + +/* + * The set of parameters needed to create or attach to a hash table. The + * members tranche_id and tranche_name do not need to be initialized when + * attaching to an existing hash table. + * + * Compare and hash functions mus be supplied even when attaching, because we + * can't safely share function pointers between backends in general. Either + * the arg variants or the non-arg variants should be supplied; the other + * function pointers should be NULL. If the arg varants are supplied then the + * user data pointer supplied to the create and attach functions will be + * passed to the hash and compare functions. + */ +typedef struct dshash_parameters +{ + size_t key_size; /* Size of the key (initial bytes of entry) */ + size_t entry_size; /* Total size of entry */ + dshash_compare_function compare_function; /* Compare function */ + dshash_compare_arg_function compare_arg_function; /* Arg version */ + dshash_hash_function hash_function; /* Hash function */ + dshash_hash_arg_function hash_arg_function; /* Arg version */ + int tranche_id; /* The tranche ID to use for locks */ +} dshash_parameters; + +/* Forward declaration of private types for use only by dht.c. */ +struct dshash_table_item; +typedef struct dshash_table_item dshash_table_item; + +/* Creating, sharing and destroying from hash tables. */ +extern dshash_table *dshash_create(dsa_area *area, + const dshash_parameters *params, + void *arg); +extern dshash_table *dshash_attach(dsa_area *area, + const dshash_parameters *params, + dshash_table_handle handle, + void *arg); +extern void dshash_detach(dshash_table *hash_table); +extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table); +extern void dshash_destroy(dshash_table *hash_table); + +/* Finding, creating, deleting entries. */ +extern void *dshash_find(dshash_table *hash_table, + const void *key, bool exclusive); +extern void *dshash_find_or_insert(dshash_table *hash_table, + const void *key, bool *found); +extern bool dshash_delete_key(dshash_table *hash_table, const void *key); +extern void dshash_delete_entry(dshash_table *hash_table, void *entry); +extern void dshash_release_lock(dshash_table *hash_table, void *entry); + +/* Debugging support. 
+extern void dshash_dump(dshash_table *hash_table);
+
+#endif   /* DSHASH_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a4ace383fa..17ba2bde5c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2604,6 +2604,13 @@ dsa_pointer
 dsa_segment_header
 dsa_segment_index
 dsa_segment_map
+dshash_hash
+dshash_parameters
+dshash_partition
+dshash_table
+dshash_table_control
+dshash_table_handle
+dshash_table_item
 dsm_control_header
 dsm_control_item
 dsm_handle
-- 
2.40.0
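
P.S. For readers trying out the API, here is a minimal usage sketch.  It is
not part of the patch, and the entry type, callbacks and names (my_entry,
my_hash, my_compare, my_params, create_table, bump_counter) are purely
illustrative.  It assumes a dsa_area has already been created elsewhere and
that the tranche obtained from LWLockNewTrancheId() is registered separately;
the key must occupy the initial key_size bytes of each entry.

#include "postgres.h"

#include "lib/dshash.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"

/* Illustrative entry type; the key is the first key_size bytes. */
typedef struct my_entry
{
	uint32		key;			/* lookup key */
	int64		count;			/* payload */
} my_entry;

/* Callbacks matching the non-arg typedefs; the arg variants are left NULL. */
static dshash_hash
my_hash(const void *v, size_t size)
{
	return *(const uint32 *) v; /* trivial hash, for illustration only */
}

static int
my_compare(const void *a, const void *b, size_t size)
{
	return memcmp(a, b, size);
}

static const dshash_parameters my_params = {
	sizeof(uint32),				/* key_size */
	sizeof(my_entry),			/* entry_size */
	my_compare,					/* compare_function */
	NULL,						/* compare_arg_function */
	my_hash,					/* hash_function */
	NULL,						/* hash_arg_function */
	0							/* tranche_id, filled in by the creator */
};

/* Creating backend: 'area' is a dsa_area set up elsewhere. */
static dshash_table_handle
create_table(dsa_area *area)
{
	dshash_parameters params = my_params;
	dshash_table *ht;
	dshash_table_handle handle;

	params.tranche_id = LWLockNewTrancheId();	/* registration not shown */
	ht = dshash_create(area, &params, NULL);
	handle = dshash_get_hash_table_handle(ht);

	/* Publish 'handle' (and the tranche id) where other backends can see it. */
	return handle;
}

/* Any backend: attach using the published handle and bump a counter. */
static void
bump_counter(dsa_area *area, dshash_table_handle handle, uint32 key)
{
	dshash_table *ht;
	my_entry   *entry;
	bool		found;

	ht = dshash_attach(area, &my_params, handle, NULL);

	entry = dshash_find_or_insert(ht, &key, &found);
	if (!found)
		entry->count = 0;		/* initialize a freshly inserted entry */
	entry->count++;
	dshash_release_lock(ht, entry); /* drop the partition lock promptly */

	/* A read-only lookup takes a shared lock instead. */
	entry = dshash_find(ht, &key, false);
	if (entry != NULL)
		dshash_release_lock(ht, entry);

	dshash_detach(ht);
}

The key point the sketch tries to show is that every non-NULL result from
dshash_find or dshash_find_or_insert leaves a partition lock (an LWLock) held,
so the entry must be released with dshash_release_lock or removed with
dshash_delete_entry as soon as possible.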