From 44ca4022f3f9297bab5cbffdd97973dbba1879ed Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Wed, 23 Mar 2016 10:56:23 -0400
Subject: [PATCH] Partition the freelist for shared dynahash tables.

Without this, contention on the freelist can become a pretty serious
problem on large servers.

Aleksander Alekseev, reviewed by Anastasia Lubennikova, Dilip Kumar,
and me.
---
 src/backend/utils/hash/dynahash.c | 184 ++++++++++++++++++++++--------
 1 file changed, 137 insertions(+), 47 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 24a53da2f4..679ca775c6 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,6 +111,8 @@
 #define DEF_DIRSIZE			   256
 #define DEF_FFACTOR			   1	/* default fill factor */
 
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS			32
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
@@ -118,6 +120,18 @@ typedef HASHELEMENT *HASHBUCKET;
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+/*
+ * Using an array of FreeListData, instead of separate arrays of mutexes,
+ * nentries and freeLists, at least partially prevents sharing one cache
+ * line between different mutexes (see below).
+ */
+typedef struct
+{
+	slock_t		mutex;			/* spinlock */
+	long		nentries;		/* number of entries */
+	HASHELEMENT *freeList;		/* list of free elements */
+} FreeListData;
+
 /*
  * Header structure for a hash table --- contains all changeable info
  *
@@ -128,12 +142,15 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-	/* In a partitioned table, take this lock to touch nentries or freeList */
-	slock_t		mutex;			/* unused if not partitioned table */
-
-	/* These fields change during entry addition/deletion */
-	long		nentries;		/* number of entries in hash table */
-	HASHELEMENT *freeList;		/* linked list of free elements */
+	/*
+	 * The freelist can become a point of contention on high-concurrency hash
+	 * tables, so we use an array of freelists, each with its own mutex and
+	 * nentries count, instead of just a single one.
+	 *
+	 * If the hash table is not partitioned, only freeList[0] is used and
+	 * spinlocks are not used at all.
+	 */
+	FreeListData freeList[NUM_FREELISTS];
 
 	/* These fields can change, but not in a partitioned table */
 	/* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +183,9 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) \
+	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +239,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int	choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -482,10 +502,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 
 	if ((flags & HASH_SHARED_MEM) || nelem < hctl->nelem_alloc)
 	{
-		if (!element_alloc(hashp, (int) nelem))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		int			i,
+					freelist_partitions,
+					nelem_alloc,
+					nelem_alloc_first;
+
+		/*
+		 * If the hash table is partitioned, all freeLists have an equal
+		 * number of elements.  Otherwise only freeList[0] is used.
+		 */
+		if (IS_PARTITIONED(hashp->hctl))
+			freelist_partitions = NUM_FREELISTS;
+		else
+			freelist_partitions = 1;
+
+		nelem_alloc = nelem / freelist_partitions;
+		if (nelem_alloc == 0)
+			nelem_alloc = 1;
+
+		/* Make sure all memory will be used */
+		if (nelem_alloc * freelist_partitions < nelem)
+			nelem_alloc_first =
+				nelem - nelem_alloc * (freelist_partitions - 1);
+		else
+			nelem_alloc_first = nelem_alloc;
+
+		for (i = 0; i < freelist_partitions; i++)
+		{
+			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+			if (!element_alloc(hashp, temp, i))
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+		}
 	}
 
 	if (flags & HASH_FIXED_SIZE)
@@ -503,9 +553,6 @@ hdefault(HTAB *hashp)
 
 	MemSet(hctl, 0, sizeof(HASHHDR));
 
-	hctl->nentries = 0;
-	hctl->freeList = NULL;
-
 	hctl->dsize = DEF_DIRSIZE;
 	hctl->nsegs = 0;
 
@@ -572,12 +619,14 @@ init_htab(HTAB *hashp, long nelem)
 	HASHSEGMENT *segp;
 	int			nbuckets;
 	int			nsegs;
+	int			i;
 
 	/*
 	 * initialize mutex if it's a partitioned table
 	 */
 	if (IS_PARTITIONED(hctl))
-		SpinLockInit(&hctl->mutex);
+		for (i = 0; i < NUM_FREELISTS; i++)
+			SpinLockInit(&(hctl->freeList[i].mutex));
 
 	/*
	 * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +697,7 @@ init_htab(HTAB *hashp, long nelem)
 			"HIGH MASK       ", hctl->high_mask,
 			"LOW MASK        ", hctl->low_mask,
 			"NSEGS           ", hctl->nsegs,
-			"NENTRIES        ", hctl->nentries);
+			"NENTRIES        ", hash_get_num_entries(hashp));
 #endif
 	return true;
 }
@@ -769,7 +818,7 @@ hash_stats(const char *where, HTAB *hashp)
 			where, hashp->hctl->accesses, hashp->hctl->collisions);
 
 	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, (long) hashp->hctl->keysize,
+			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
 			hashp->hctl->max_bucket, hashp->hctl->nsegs);
 	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 			where, hash_accesses, hash_collisions);
@@ -863,6 +912,7 @@ hash_search_with_hash_value(HTAB *hashp,
 	HASHBUCKET	currBucket;
 	HASHBUCKET *prevBucketPtr;
 	HashCompareFunc match;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
 	hash_accesses++;
@@ -885,7 +935,7 @@ hash_search_with_hash_value(HTAB *hashp,
 		 * order of these tests is to try to check cheaper conditions first.
 		 */
 		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-			hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+			hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
 			!has_seq_scans(hashp))
 			(void) expand_table(hashp);
 	}
@@ -943,20 +993,20 @@ hash_search_with_hash_value(HTAB *hashp,
 			{
 				/* if partitioned, must lock to touch nentries and freeList */
 				if (IS_PARTITIONED(hctl))
-					SpinLockAcquire(&hctl->mutex);
+					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				Assert(hctl->freeList[freelist_idx].nentries > 0);
+				hctl->freeList[freelist_idx].nentries--;
 
 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;
 
 				/* add the record to the freelist for this table.  */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = hctl->freeList[freelist_idx].freeList;
+				hctl->freeList[freelist_idx].freeList = currBucket;
 
 				if (IS_PARTITIONED(hctl))
-					SpinLockRelease(&hctl->mutex);
+					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
 				/*
				 * better hope the caller is synchronizing access to this
@@ -982,7 +1032,7 @@ hash_search_with_hash_value(HTAB *hashp,
 				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 					 hashp->tabname);
 
-			currBucket = get_hash_entry(hashp);
+			currBucket = get_hash_entry(hashp, freelist_idx);
 			if (currBucket == NULL)
 			{
 				/* out of memory */
@@ -1175,39 +1225,69 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
-	HASHHDR    *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	HASHBUCKET	newElement;
+	int			borrow_from_idx;
 
 	for (;;)
 	{
 		/* if partitioned, must lock to touch nentries and freeList */
 		if (IS_PARTITIONED(hctl))
-			SpinLockAcquire(&hctl->mutex);
+			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
 		/* try to get an entry from the freelist */
-		newElement = hctl->freeList;
+		newElement = hctl->freeList[freelist_idx].freeList;
+
 		if (newElement != NULL)
 			break;
 
-		/* no free elements.  allocate another chunk of buckets */
 		if (IS_PARTITIONED(hctl))
-			SpinLockRelease(&hctl->mutex);
+			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
+		/* no free elements.  allocate another chunk of buckets */
-		if (!element_alloc(hashp, hctl->nelem_alloc))
+		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
 		{
-			/* out of memory */
-			return NULL;
+			if (!IS_PARTITIONED(hctl))
+				return NULL;	/* out of memory */
+
+			/* try to borrow element from another partition */
+			borrow_from_idx = freelist_idx;
+			for (;;)
+			{
+				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+				if (borrow_from_idx == freelist_idx)
+					break;
+
+				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
+				newElement = hctl->freeList[borrow_from_idx].freeList;
+
+				if (newElement != NULL)
+				{
+					hctl->freeList[borrow_from_idx].freeList = newElement->link;
+					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+
+					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+					hctl->freeList[freelist_idx].nentries++;
+					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+					break;
+				}
+
+				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+			}
+
+			return newElement;
 		}
 	}
 
 	/* remove entry from freelist, bump nentries */
-	hctl->freeList = newElement->link;
-	hctl->nentries++;
+	hctl->freeList[freelist_idx].freeList = newElement->link;
+	hctl->freeList[freelist_idx].nentries++;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
 	return newElement;
 }
@@ -1218,11 +1298,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+	int			i;
+	long		sum = hashp->hctl->freeList[0].nentries;
+
 	/*
 	 * We currently don't bother with the mutex; it's only sensible to call
 	 * this function if you've got lock on all partitions of the table.
 	 */
-	return hashp->hctl->nentries;
+
+	if (!IS_PARTITIONED(hashp->hctl))
+		return sum;
+
+	for (i = 1; i < NUM_FREELISTS; i++)
+		sum += hashp->hctl->freeList[i].nentries;
+
+	return sum;
 }
 
 /*
@@ -1527,12 +1617,12 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
-	HASHHDR    *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
 	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
@@ -1563,14 +1653,14 @@ element_alloc(HTAB *hashp, int nelem)
 
 	/* if partitioned, must lock to touch freeList */
 	if (IS_PARTITIONED(hctl))
-		SpinLockAcquire(&hctl->mutex);
+		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
 	/* freelist could be nonempty if two backends did this concurrently */
-	firstElement->link = hctl->freeList;
-	hctl->freeList = prevElement;
+	firstElement->link = hctl->freeList[freelist_idx].freeList;
+	hctl->freeList[freelist_idx].freeList = prevElement;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
 	return true;
 }
-- 
2.40.0
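
For context, the partitioned code paths above are reached only when a table
is created with the HASH_PARTITION flag and a power-of-2 num_partitions,
which is what makes IS_PARTITIONED() true. A sketch of a typical caller,
loosely modeled on the shared buffer lookup table; the MyKey and MyLookupEnt
types and the size variables are hypothetical, not part of this patch:

	HASHCTL		info;
	HTAB	   *htab;

	info.keysize = sizeof(MyKey);			/* hypothetical key type */
	info.entrysize = sizeof(MyLookupEnt);	/* hypothetical entry type */
	info.num_partitions = 16;				/* must be a power of 2 */

	htab = ShmemInitHash("My Lookup Table",
						 init_size, max_size,	/* hypothetical sizes */
						 &info,
						 HASH_ELEM | HASH_BLOBS | HASH_PARTITION);

ShmemInitHash() adds HASH_SHARED_MEM itself, so such a table takes the
up-front element_alloc() loop in hash_create() and spreads its elements
across all NUM_FREELISTS freelists.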
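
The element-distribution arithmetic added to hash_create() is worth a worked
example. With a hypothetical nelem = 1000 and NUM_FREELISTS = 32,
nelem_alloc = 1000 / 32 = 31; since 31 * 32 = 992 < 1000, freeList[0] is
seeded with 1000 - 31 * 31 = 39 elements and each of the other 31 freelists
with 31, so 39 + 31 * 31 = 1000 and the "Make sure all memory will be used"
branch really does hand out every requested element. The same arithmetic as
a standalone check:

	#include <assert.h>

	#define NUM_FREELISTS 32

	int
	main(void)
	{
		long		nelem = 1000;	/* hypothetical requested table size */
		long		nelem_alloc = nelem / NUM_FREELISTS;	/* 31 */
		long		nelem_alloc_first;

		if (nelem_alloc == 0)
			nelem_alloc = 1;

		/* give freeList[0] the remainder, mirroring the hash_create() hunk */
		if (nelem_alloc * NUM_FREELISTS < nelem)
			nelem_alloc_first = nelem - nelem_alloc * (NUM_FREELISTS - 1);	/* 39 */
		else
			nelem_alloc_first = nelem_alloc;

		assert(nelem_alloc_first + nelem_alloc * (NUM_FREELISTS - 1) == nelem);
		return 0;
	}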
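
The heart of the patch is the new get_hash_entry() behavior: for a
partitioned table, an empty home freelist is no longer an immediate failure,
because the backend first tries to allocate a fresh chunk and then walks the
other freelists looking for an element to borrow. A minimal self-contained
sketch of that scheme, with pthread mutexes standing in for PostgreSQL
spinlocks and hypothetical names (FreeList, get_entry, put_entry):

	#include <pthread.h>
	#include <stddef.h>

	#define NUM_FREELISTS 32

	typedef struct Elem
	{
		struct Elem *link;
	} Elem;

	typedef struct
	{
		pthread_mutex_t mutex;	/* stand-in for slock_t */
		long		nentries;	/* entries handed out, not entries on the list */
		Elem	   *freeList;
	} FreeList;

	static FreeList freeLists[NUM_FREELISTS];

	/* same idea as FREELIST_IDX(): low-order bits of the hash pick a freelist */
	#define FL_IDX(hashcode) ((hashcode) % NUM_FREELISTS)

	static Elem *
	get_entry(unsigned int hashcode)
	{
		int			idx = FL_IDX(hashcode);
		int			borrow;
		Elem	   *e;

		pthread_mutex_lock(&freeLists[idx].mutex);
		e = freeLists[idx].freeList;
		if (e != NULL)
		{
			freeLists[idx].freeList = e->link;
			freeLists[idx].nentries++;
			pthread_mutex_unlock(&freeLists[idx].mutex);
			return e;
		}
		pthread_mutex_unlock(&freeLists[idx].mutex);

		/* home freelist empty: walk the neighbors, as in get_hash_entry() */
		for (borrow = (idx + 1) % NUM_FREELISTS; borrow != idx;
			 borrow = (borrow + 1) % NUM_FREELISTS)
		{
			pthread_mutex_lock(&freeLists[borrow].mutex);
			e = freeLists[borrow].freeList;
			if (e != NULL)
			{
				freeLists[borrow].freeList = e->link;
				pthread_mutex_unlock(&freeLists[borrow].mutex);

				/* the borrowed entry is accounted to the requesting freelist */
				pthread_mutex_lock(&freeLists[idx].mutex);
				freeLists[idx].nentries++;
				pthread_mutex_unlock(&freeLists[idx].mutex);
				return e;
			}
			pthread_mutex_unlock(&freeLists[borrow].mutex);
		}
		return NULL;			/* every freelist is empty */
	}

	static void
	put_entry(Elem *e, unsigned int hashcode)
	{
		int			idx = FL_IDX(hashcode);

		pthread_mutex_lock(&freeLists[idx].mutex);
		e->link = freeLists[idx].freeList;
		freeLists[idx].freeList = e;
		freeLists[idx].nentries--;
		pthread_mutex_unlock(&freeLists[idx].mutex);
	}

The sketch omits the element_alloc() retry and assumes the mutexes have been
initialized (e.g. with pthread_mutex_init() at startup). It also makes the
patch's accounting choice explicit: nentries counts entries a freelist has
handed out, so a successful borrow bumps the requester's count, and the
entry later returns to the requester's list, since the free path derives the
index from the entry's hash value as well.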
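
Finally, the FreeListData comment is careful to say the layout only "at
least partially" avoids false sharing: at roughly 24 bytes on a typical
64-bit build, two or three freelists still fit in one 64-byte cache line.
If profiling showed the remaining sharing to matter, one hypothetical remedy
(not part of this patch, and written with C11 alignas, which this era of
PostgreSQL does not use) would be to pad each slot to a full line:

	#include <stdalign.h>

	#define CACHE_LINE_SIZE 64	/* common on x86-64; an assumption */

	/* slock_t and HASHELEMENT come from the surrounding dynahash.c headers */
	typedef struct
	{
		/* aligning the first member rounds sizeof up to a cache-line multiple */
		alignas(CACHE_LINE_SIZE) slock_t mutex;
		long		nentries;
		HASHELEMENT *freeList;
	} FreeListData;

The cost would be 32 * 64 = 2048 bytes per hash table header instead of
roughly 768, which makes the plain field grouping used here a reasonable
middle ground.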