* dynahash.c
* dynamic hash tables
*
+ * dynahash.c supports both local-to-a-backend hash tables and hash tables in
+ * shared memory. For shared hash tables, it is the caller's responsibility
+ * to provide appropriate access interlocking. The simplest convention is
+ * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
+ * hash_seq_search) need only shared lock, but any update requires exclusive
+ * lock. For heavily-used shared tables, the single-lock approach creates a
+ * concurrency bottleneck, so we also support "partitioned" locking wherein
+ * there are multiple LWLocks guarding distinct subsets of the table. To use
+ * a hash table in partitioned mode, the HASH_PARTITION flag must be given
+ * to hash_create. This prevents any attempt to split buckets on-the-fly.
+ * Therefore, each hash bucket chain operates independently, and no fields
+ * of the hash header change after init except nentries and freeList.
+ * (A partitioned table uses a spinlock to guard changes of those two
+ * fields.)  This independence lets any subset of the hash buckets be
+ * treated as a separately lockable partition.  We expect callers to use
+ * the low-order bits of a
+ * lookup key's hash value as a partition number --- this will work because
+ * of the way calc_bucket() maps hash values to bucket numbers.
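+ *
+ * An illustrative caller-side sequence (a sketch only; NUM_PARTITIONS and
+ * PartitionLocks[] are the caller's own constructs, not part of this API):
+ *
+ *		hashvalue = get_hash_value(hashp, keyPtr);
+ *		lock = PartitionLocks[hashvalue % NUM_PARTITIONS];
+ *		LWLockAcquire(lock, LW_EXCLUSIVE);
+ *		entry = hash_search_with_hash_value(hashp, keyPtr, hashvalue,
+ *											HASH_ENTER, &found);
+ *		... work with entry ...
+ *		LWLockRelease(lock);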
*
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.69 2006/07/14 14:52:25 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.70 2006/07/22 23:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
+
/*
+ * Original comments:
*
* Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
* Coded into C, with minor code improvements, and with hsearch(3) interface,
#include "postgres.h"
#include "storage/shmem.h"
+#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"
+
+/*
+ * Constants
+ *
+ * A hash table has a top-level "directory", each of whose entries points
+ * to a "segment" of ssize bucket headers. The maximum number of hash
+ * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
+ * the number of records in the table can be larger, but we don't want a
+ * whole lot of records per bucket or performance goes down.
+ *
+ * In a hash table allocated in shared memory, the directory cannot be
+ * expanded because it must stay at a fixed address. The directory size
+ * should be selected using hash_select_dirsize (and you'd better have
+ * a good idea of the maximum number of entries!). For non-shared hash
+ * tables, the initial directory size can be left at the default.
+ */
+#define DEF_SEGSIZE 256
+#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
+#define DEF_DIRSIZE 256
+#define DEF_FFACTOR 1 /* default fill factor */
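+
+/*
+ * With these defaults, a table can reach DEF_DIRSIZE * DEF_SEGSIZE =
+ * 256 * 256 = 65536 buckets before dir_realloc() must grow the directory
+ * (which is possible only for non-shared tables).
+ */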
+
+
+/* A hash bucket is a linked list of HASHELEMENTs */
+typedef HASHELEMENT *HASHBUCKET;
+
+/* A hash segment is an array of bucket headers */
+typedef HASHBUCKET *HASHSEGMENT;
+
+/*
+ * Header structure for a hash table --- contains all changeable info
+ *
+ * In a shared-memory hash table, the HASHHDR is in shared memory, while
+ * each backend has a local HTAB struct. For a non-shared table, there isn't
+ * any functional difference between HASHHDR and HTAB, but we separate them
+ * anyway to share code between shared and non-shared tables.
+ */
+struct HASHHDR
+{
+ /* In a partitioned table, take this lock to touch nentries or freeList */
+ slock_t mutex; /* unused if not partitioned table */
+
+ /* These fields change during entry addition/deletion */
+ long nentries; /* number of entries in hash table */
+ HASHELEMENT *freeList; /* linked list of free elements */
+
+ /* These fields can change, but not in a partitioned table */
+ /* Also, dsize can't change in a shared table, even if unpartitioned */
+ long dsize; /* directory size */
+ long nsegs; /* number of allocated segments (<= dsize) */
+ uint32 max_bucket; /* ID of maximum bucket in use */
+ uint32 high_mask; /* mask to modulo into entire table */
+ uint32 low_mask; /* mask to modulo into lower half of table */
+
+ /* These fields are fixed at hashtable creation */
+ Size keysize; /* hash key length in bytes */
+ Size entrysize; /* total user element size in bytes */
+ long num_partitions; /* # partitions (must be power of 2), or 0 */
+ long ffactor; /* target fill factor */
+ long max_dsize; /* 'dsize' limit if directory is fixed size */
+ long ssize; /* segment size --- must be power of 2 */
+ int sshift; /* segment shift = log2(ssize) */
+ int nelem_alloc; /* number of entries to allocate at once */
+
+#ifdef HASH_STATISTICS
+ /*
+ * Count statistics here. NB: stats code doesn't bother with mutex,
+ * so counts could be corrupted a bit in a partitioned table.
+ */
+ long accesses;
+ long collisions;
+#endif
+};
+
+#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
+
+/*
+ * Top control structure for a hashtable --- in a shared table, each backend
+ * has its own copy (OK since no fields change at runtime)
+ */
+struct HTAB
+{
+ HASHHDR *hctl; /* => shared control information */
+ HASHSEGMENT *dir; /* directory of segment starts */
+ HashValueFunc hash; /* hash function */
+ HashCompareFunc match; /* key comparison function */
+ HashCopyFunc keycopy; /* key copying function */
+ HashAllocFunc alloc; /* memory allocator */
+ MemoryContext hcxt; /* memory context if default allocator used */
+ char *tabname; /* table name (for error messages) */
+ bool isshared; /* true if table is in shared memory */
+
+ /* We keep local copies of these fixed values to reduce contention */
+ Size keysize; /* hash key length in bytes */
+ long ssize; /* segment size --- must be power of 2 */
+ int sshift; /* segment shift = log2(ssize) */
+};
+
/*
* Key (also entry) part of a HASHELEMENT
*/
*/
#define MOD(x,y) ((x) & ((y)-1))
+#ifdef HASH_STATISTICS
+static long hash_accesses,
+ hash_collisions,
+ hash_expansions;
+#endif
+
/*
* Private function prototypes
*/
static bool element_alloc(HTAB *hashp, int nelem);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp);
static void hdefault(HTAB *hashp);
static int choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
}
-#if HASH_STATISTICS
-static long hash_accesses,
- hash_collisions,
- hash_expansions;
-#endif
-
-
/************************** CREATE ROUTINES **********************/
/*
if (flags & HASH_SHARED_MEM)
{
/*
- * ctl structure is preallocated for shared memory tables. Note that
- * HASH_DIRSIZE and HASH_ALLOC had better be set as well.
+ * ctl structure and directory are preallocated for shared memory
+ * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set
+ * as well.
*/
hashp->hctl = info->hctl;
- hashp->dir = info->dir;
+ hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
hashp->hcxt = NULL;
hashp->isshared = true;
/* hash table already exists, we're just attaching to it */
if (flags & HASH_ATTACH)
+ {
+ /* make local copies of some heavily-used values */
+ hctl = hashp->hctl;
+ hashp->keysize = hctl->keysize;
+ hashp->ssize = hctl->ssize;
+ hashp->sshift = hctl->sshift;
+
return hashp;
+ }
}
else
{
hdefault(hashp);
hctl = hashp->hctl;
-#ifdef HASH_STATISTICS
- hctl->accesses = hctl->collisions = 0;
-#endif
+
+ if (flags & HASH_PARTITION)
+ {
+ /* Doesn't make sense to partition a local hash table */
+ Assert(flags & HASH_SHARED_MEM);
+ /* # of partitions had better be a power of 2 */
+ Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));
+
+ hctl->num_partitions = info->num_partitions;
+ }
if (flags & HASH_SEGMENT)
{
hctl->entrysize = info->entrysize;
}
+ /* make local copies of heavily-used constant fields */
+ hashp->keysize = hctl->keysize;
+ hashp->ssize = hctl->ssize;
+ hashp->sshift = hctl->sshift;
+
/* Build the hash directory structure */
if (!init_htab(hashp, nelem))
{
MemSet(hctl, 0, sizeof(HASHHDR));
- hctl->ssize = DEF_SEGSIZE;
- hctl->sshift = DEF_SEGSIZE_SHIFT;
- hctl->dsize = DEF_DIRSIZE;
- hctl->ffactor = DEF_FFACTOR;
hctl->nentries = 0;
+ hctl->freeList = NULL;
+
+ hctl->dsize = DEF_DIRSIZE;
hctl->nsegs = 0;
/* rather pointless defaults for key & entry size */
hctl->keysize = sizeof(char *);
hctl->entrysize = 2 * sizeof(char *);
+ hctl->num_partitions = 0; /* not partitioned */
+
+ hctl->ffactor = DEF_FFACTOR;
+
/* table has no fixed maximum size */
hctl->max_dsize = NO_MAX_DSIZE;
- /* garbage collection for HASH_REMOVE */
- hctl->freeList = NULL;
+ hctl->ssize = DEF_SEGSIZE;
+ hctl->sshift = DEF_SEGSIZE_SHIFT;
+
+#ifdef HASH_STATISTICS
+ hctl->accesses = hctl->collisions = 0;
+#endif
}
/*
return nelem_alloc;
}
+/*
+ * Compute derived fields of hctl and build the initial directory/segment
+ * arrays
+ */
static bool
init_htab(HTAB *hashp, long nelem)
{
int nbuckets;
int nsegs;
+ /*
+ * initialize mutex if it's a partitioned table
+ */
+ if (IS_PARTITIONED(hctl))
+ SpinLockInit(&hctl->mutex);
+
/*
* Divide number of elements by the fill factor to determine a desired
* number of buckets. Allocate space for the next greater power of two
nbuckets = 1 << my_log2(lnbuckets);
+ /*
+ * In a partitioned table, nbuckets must be at least equal to
+ * num_partitions; were it less, keys with apparently different partition
+ * numbers would map to the same bucket, breaking partition independence.
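+ * For example, with num_partitions = 16 but only 8 buckets, the effective
+ * bucket mask would be 0x7, so hash values 0x3 and 0xB --- partitions 3
+ * and 11 --- would both land in bucket 3.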
+ * (Normally nbuckets will be much bigger; this is just a safety check.)
+ */
+ while (nbuckets < hctl->num_partitions)
+ nbuckets <<= 1;
+
hctl->max_bucket = hctl->low_mask = nbuckets - 1;
hctl->high_mask = (nbuckets << 1) - 1;
return nDirEntries;
}
+/*
+ * Compute the required initial memory allocation for a shared-memory
+ * hashtable with the given parameters. We need space for the HASHHDR
+ * and for the (non-expansible) directory.
+ */
+Size
+hash_get_shared_size(HASHCTL *info, int flags)
+{
+ Assert(flags & HASH_DIRSIZE);
+ Assert(info->dsize == info->max_dsize);
+ return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
+}
+
/********************** DESTROY ROUTINES ************************/
where, hashp->hctl->accesses, hashp->hctl->collisions);
fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
- hashp->hctl->nentries, hashp->hctl->keysize,
+ hashp->hctl->nentries, (long) hashp->hctl->keysize,
hashp->hctl->max_bucket, hashp->hctl->nsegs);
fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
where, hash_accesses, hash_collisions);
/*******************************SEARCH ROUTINES *****************************/
+/*
+ * get_hash_value -- exported routine to calculate a key's hash value
+ *
+ * We export this because for partitioned tables, callers need to compute
+ * the partition number (from the low-order bits of the hash value) before
+ * searching.
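+ *
+ * With a power-of-2 partition count, that computation is simply
+ *		partitionNum = get_hash_value(hashp, keyPtr) & (num_partitions - 1);
+ * where num_partitions is whatever the caller passed to hash_create.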
+ */
+uint32
+get_hash_value(HTAB *hashp, const void *keyPtr)
+{
+ return hashp->hash(keyPtr, hashp->keysize);
+}
+
/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
return bucket;
}
-/*----------
+/*
* hash_search -- look up key in table and perform action
+ * hash_search_with_hash_value -- same, with key's hash value already computed
*
* action is one of:
* HASH_FIND: look up key in table
* If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
* existing entry in the table, FALSE otherwise. This is needed in the
* HASH_ENTER case, but is redundant with the return value otherwise.
- *----------
+ *
+ * For hash_search_with_hash_value, the hashvalue parameter must have been
+ * calculated with get_hash_value().
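+ *
+ * A typical HASH_ENTER call looks like this (a sketch; "MyEntry" is a
+ * caller-defined struct whose first field is the lookup key):
+ *
+ *		bool	found;
+ *		MyEntry *entry = (MyEntry *) hash_search(hashp, &key,
+ *												 HASH_ENTER, &found);
+ *		if (!found)
+ *			... initialize the non-key fields of *entry ...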
*/
void *
hash_search(HTAB *hashp,
const void *keyPtr,
HASHACTION action,
bool *foundPtr)
+{
+ return hash_search_with_hash_value(hashp,
+ keyPtr,
+ hashp->hash(keyPtr, hashp->keysize),
+ action,
+ foundPtr);
+}
+
+void *
+hash_search_with_hash_value(HTAB *hashp,
+ const void *keyPtr,
+ uint32 hashvalue,
+ HASHACTION action,
+ bool *foundPtr)
{
HASHHDR *hctl = hashp->hctl;
- Size keysize = hctl->keysize;
- uint32 hashvalue;
+ Size keysize;
uint32 bucket;
long segment_num;
long segment_ndx;
/*
* Do the initial lookup
*/
- hashvalue = hashp->hash(keyPtr, keysize);
bucket = calc_bucket(hctl, hashvalue);
- segment_num = bucket >> hctl->sshift;
- segment_ndx = MOD(bucket, hctl->ssize);
+ segment_num = bucket >> hashp->sshift;
+ segment_ndx = MOD(bucket, hashp->ssize);
segp = hashp->dir[segment_num];
* Follow collision chain looking for matching key
*/
match = hashp->match; /* save one fetch in inner loop */
+ keysize = hashp->keysize; /* ditto */
while (currBucket != NULL)
{
case HASH_REMOVE:
if (currBucket != NULL)
{
- Assert(hctl->nentries > 0);
- hctl->nentries--;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile HASHHDR *hctlv = hctl;
+
+ /* if partitioned, must lock to touch nentries and freeList */
+ if (IS_PARTITIONED(hctlv))
+ SpinLockAcquire(&hctlv->mutex);
+
+ Assert(hctlv->nentries > 0);
+ hctlv->nentries--;
/* remove record from hash bucket's chain. */
*prevBucketPtr = currBucket->link;
/* add the record to the freelist for this table. */
- currBucket->link = hctl->freeList;
- hctl->freeList = currBucket;
+ currBucket->link = hctlv->freeList;
+ hctlv->freeList = currBucket;
+
+ if (IS_PARTITIONED(hctlv))
+ SpinLockRelease(&hctlv->mutex);
/*
* better hope the caller is synchronizing access to this
if (currBucket != NULL)
return (void *) ELEMENTKEY(currBucket);
- /* get the next free element */
- currBucket = hctl->freeList;
+ currBucket = get_hash_entry(hashp);
if (currBucket == NULL)
{
- /* no free elements. allocate another chunk of buckets */
- if (!element_alloc(hashp, hctl->nelem_alloc))
- {
- /* out of memory */
- if (action == HASH_ENTER_NULL)
- return NULL;
- /* report a generic message */
- if (hashp->isshared)
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of shared memory")));
- else
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
- }
- currBucket = hctl->freeList;
- Assert(currBucket != NULL);
+ /* out of memory */
+ if (action == HASH_ENTER_NULL)
+ return NULL;
+ /* report a generic message */
+ if (hashp->isshared)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
}
- hctl->freeList = currBucket->link;
-
/* link into hashbucket chain */
*prevBucketPtr = currBucket;
currBucket->link = NULL;
/* caller is expected to fill the data field on return */
- /* Check if it is time to split the segment */
- if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+ /* Check if it is time to split a bucket */
+ /* Can't split if running in partitioned mode */
+ if (!IS_PARTITIONED(hctl) &&
+ hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
{
/*
* NOTE: failure to expand table is not a fatal error, it just
return NULL; /* keep compiler quiet */
}
+/*
+ * Allocate a new hashtable entry if possible; return NULL if out of memory
+ * (which for a shared table means out of shared memory).
+ */
+static HASHBUCKET
+get_hash_entry(HTAB *hashp)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile HASHHDR *hctlv = hashp->hctl;
+ HASHBUCKET newElement;
+
+ for (;;)
+ {
+ /* if partitioned, must lock to touch nentries and freeList */
+ if (IS_PARTITIONED(hctlv))
+ SpinLockAcquire(&hctlv->mutex);
+
+ /* try to get an entry from the freelist */
+ newElement = hctlv->freeList;
+ if (newElement != NULL)
+ break;
+
+ /* no free elements. allocate another chunk of buckets */
+ if (IS_PARTITIONED(hctlv))
+ SpinLockRelease(&hctlv->mutex);
+
+ if (!element_alloc(hashp, hctlv->nelem_alloc))
+ {
+ /* out of memory */
+ return NULL;
+ }
+ }
+
+ /* remove entry from freelist, bump nentries */
+ hctlv->freeList = newElement->link;
+ hctlv->nentries++;
+
+ if (IS_PARTITIONED(hctlv))
+ SpinLockRelease(&hctlv->mutex);
+
+ return newElement;
+}
+
+/*
+ * hash_get_num_entries -- get the number of entries in a hashtable
+ */
+long
+hash_get_num_entries(HTAB *hashp)
+{
+ /*
+ * We currently don't bother with the mutex; it's only sensible to call
+ * this function if you've got lock on all partitions of the table.
+ */
+ return hashp->hctl->nentries;
+}
+
/*
* hash_seq_init/_search
* Sequentially search through hash table and return
* UNDEFINED (it might be the one that curIndex is pointing at!). Also,
* if elements are added to the table while the scan is in progress, it is
* unspecified whether they will be visited by the scan or not.
+ *
+ * NOTE: to use this with a partitioned hashtable, caller had better hold
+ * at least shared lock on all partitions of the table throughout the scan!
*/
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
curBucket = status->curBucket;
hashp = status->hashp;
hctl = hashp->hctl;
- ssize = hctl->ssize;
+ ssize = hashp->ssize;
max_bucket = hctl->max_bucket;
if (curBucket > max_bucket)
/*
* first find the right segment in the table directory.
*/
- segment_num = curBucket >> hctl->sshift;
+ segment_num = curBucket >> hashp->sshift;
segment_ndx = MOD(curBucket, ssize);
segp = hashp->dir[segment_num];
HASHBUCKET currElement,
nextElement;
+ Assert(!IS_PARTITIONED(hctl));
+
#ifdef HASH_STATISTICS
hash_expansions++;
#endif
new_bucket = hctl->max_bucket + 1;
- new_segnum = new_bucket >> hctl->sshift;
- new_segndx = MOD(new_bucket, hctl->ssize);
+ new_segnum = new_bucket >> hashp->sshift;
+ new_segndx = MOD(new_bucket, hashp->ssize);
if (new_segnum >= hctl->nsegs)
{
* split at this point. With a different way of reducing the hash value,
* that might not be true!
*/
- old_segnum = old_bucket >> hctl->sshift;
- old_segndx = MOD(old_bucket, hctl->ssize);
+ old_segnum = old_bucket >> hashp->sshift;
+ old_segndx = MOD(old_bucket, hashp->ssize);
old_seg = hashp->dir[old_segnum];
new_seg = hashp->dir[new_segnum];
HASHSEGMENT segp;
CurrentDynaHashCxt = hashp->hcxt;
- segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->hctl->ssize);
+ segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
if (!segp)
return NULL;
- MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->hctl->ssize);
+ MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
return segp;
}
static bool
element_alloc(HTAB *hashp, int nelem)
{
- HASHHDR *hctl = hashp->hctl;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile HASHHDR *hctlv = hashp->hctl;
Size elementSize;
+ HASHELEMENT *firstElement;
HASHELEMENT *tmpElement;
+ HASHELEMENT *prevElement;
int i;
/* Each element has a HASHELEMENT header plus user data. */
- elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);
CurrentDynaHashCxt = hashp->hcxt;
- tmpElement = (HASHELEMENT *)
- hashp->alloc(nelem * elementSize);
+ firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
- if (!tmpElement)
+ if (!firstElement)
return false;
- /* link all the new entries into the freelist */
+ /* prepare to link all the new entries into the freelist */
+ prevElement = NULL;
+ tmpElement = firstElement;
for (i = 0; i < nelem; i++)
{
- tmpElement->link = hctl->freeList;
- hctl->freeList = tmpElement;
+ tmpElement->link = prevElement;
+ prevElement = tmpElement;
tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
}
+ /* if partitioned, must lock to touch freeList */
+ if (IS_PARTITIONED(hctlv))
+ SpinLockAcquire(&hctlv->mutex);
+
+ /* freelist could be nonempty if two backends did this concurrently */
+ firstElement->link = hctlv->freeList;
+ hctlv->freeList = prevElement;
+
+ if (IS_PARTITIONED(hctlv))
+ SpinLockRelease(&hctlv->mutex);
+
return true;
}
/*-------------------------------------------------------------------------
*
* hsearch.h
- * for hash tables, particularly hash tables in shared memory
+ * exported definitions for utils/hash/dynahash.c; see notes therein
*
*
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.43 2006/06/25 18:29:49 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.44 2006/07/22 23:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
*/
typedef void *(*HashAllocFunc) (Size request);
-/*
- * Constants
- *
- * A hash table has a top-level "directory", each of whose entries points
- * to a "segment" of ssize bucket headers. The maximum number of hash
- * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
- * the number of records in the table can be larger, but we don't want a
- * whole lot of records per bucket or performance goes down.
- *
- * In a hash table allocated in shared memory, the directory cannot be
- * expanded because it must stay at a fixed address. The directory size
- * should be selected using hash_select_dirsize (and you'd better have
- * a good idea of the maximum number of entries!). For non-shared hash
- * tables, the initial directory size can be left at the default.
- */
-#define DEF_SEGSIZE 256
-#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
-#define DEF_DIRSIZE 256
-#define DEF_FFACTOR 1 /* default fill factor */
-
-
/*
* HASHELEMENT is the private part of a hashtable entry. The caller's data
* follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key
uint32 hashvalue; /* hash function result for this entry */
} HASHELEMENT;
-/* A hash bucket is a linked list of HASHELEMENTs */
-typedef HASHELEMENT *HASHBUCKET;
+/* Hash table header struct is an opaque type known only within dynahash.c */
+typedef struct HASHHDR HASHHDR;
-/* A hash segment is an array of bucket headers */
-typedef HASHBUCKET *HASHSEGMENT;
-
-/* Header structure for a hash table --- contains all changeable info */
-typedef struct HASHHDR
-{
- long dsize; /* Directory Size */
- long ssize; /* Segment Size --- must be power of 2 */
- int sshift; /* Segment shift = log2(ssize) */
- uint32 max_bucket; /* ID of Maximum bucket in use */
- uint32 high_mask; /* Mask to modulo into entire table */
- uint32 low_mask; /* Mask to modulo into lower half of table */
- long ffactor; /* Fill factor */
- long nentries; /* Number of entries in hash table */
- long nsegs; /* Number of allocated segments */
- Size keysize; /* hash key length in bytes */
- Size entrysize; /* total user element size in bytes */
- long max_dsize; /* 'dsize' limit if directory is fixed size */
- int nelem_alloc; /* number of entries to allocate at once */
- HASHELEMENT *freeList; /* linked list of free elements */
-#ifdef HASH_STATISTICS
- long accesses;
- long collisions;
-#endif
-} HASHHDR;
-
-/*
- * Top control structure for a hashtable --- need not be shared, since
- * no fields change at runtime
- */
-typedef struct HTAB
-{
- HASHHDR *hctl; /* shared control information */
- HASHSEGMENT *dir; /* directory of segment starts */
- HashValueFunc hash; /* hash function */
- HashCompareFunc match; /* key comparison function */
- HashCopyFunc keycopy; /* key copying function */
- HashAllocFunc alloc; /* memory allocator */
- MemoryContext hcxt; /* memory context if default allocator used */
- char *tabname; /* table name (for error messages) */
- bool isshared; /* true if table is in shared memory */
-} HTAB;
+/* Hash table control struct is an opaque type known only within dynahash.c */
+typedef struct HTAB HTAB;
/* Parameter data structure for hash_create */
/* Only those fields indicated by hash_flags need be set */
typedef struct HASHCTL
{
- long ssize; /* Segment Size */
- long dsize; /* (initial) Directory Size */
- long max_dsize; /* limit to dsize if directory size is limited */
- long ffactor; /* Fill factor */
+ long num_partitions; /* # partitions (must be power of 2) */
+ long ssize; /* segment size */
+ long dsize; /* (initial) directory size */
+ long max_dsize; /* limit to dsize if dir size is limited */
+ long ffactor; /* fill factor */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
- HASHSEGMENT *dir; /* directory of segment starts */
- HASHHDR *hctl; /* location of header in shared mem */
MemoryContext hcxt; /* memory context to use for allocations */
+ HASHHDR *hctl; /* location of header in shared mem */
} HASHCTL;
/* Flags to indicate which parameters are supplied */
+#define HASH_PARTITION 0x001 /* Hashtable is used w/partitioned locking */
#define HASH_SEGMENT 0x002 /* Set segment size */
-#define HASH_DIRSIZE 0x004 /* Set directory size */
+#define HASH_DIRSIZE 0x004 /* Set directory size (initial and max) */
#define HASH_FFACTOR 0x008 /* Set fill factor */
#define HASH_FUNCTION 0x010 /* Set user defined hash function */
-#define HASH_ELEM 0x020 /* Set key/entry size */
+#define HASH_ELEM 0x020 /* Set keysize and entrysize */
#define HASH_SHARED_MEM 0x040 /* Hashtable is in shared memory */
#define HASH_ATTACH 0x080 /* Do not initialize hctl */
#define HASH_ALLOC 0x100 /* Set memory allocator */
-#define HASH_CONTEXT 0x200 /* Set explicit memory context */
+#define HASH_CONTEXT 0x200 /* Set memory allocation context */
#define HASH_COMPARE 0x400 /* Set user defined comparison function */
#define HASH_KEYCOPY 0x800 /* Set user defined key-copying function */
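+
+/*
+ * Illustrative creation of a partitioned shared-memory hash table (a
+ * sketch; MyKey and MyEntry are caller-defined, while tag_hash and
+ * ShmemInitHash are the existing helpers in hashfn.c and shmem.c):
+ *
+ *		HASHCTL		info;
+ *
+ *		info.keysize = sizeof(MyKey);
+ *		info.entrysize = sizeof(MyEntry);
+ *		info.hash = tag_hash;
+ *		info.num_partitions = 16;		/* must be a power of 2 */
+ *		table = ShmemInitHash("My Table", init_size, max_size, &info,
+ *							  HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+ */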
extern void hash_stats(const char *where, HTAB *hashp);
extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action,
bool *foundPtr);
+extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr);
+extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
+ uint32 hashvalue, HASHACTION action,
+ bool *foundPtr);
+extern long hash_get_num_entries(HTAB *hashp);
extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
extern void *hash_seq_search(HASH_SEQ_STATUS *status);
extern Size hash_estimate_size(long num_entries, Size entrysize);
extern long hash_select_dirsize(long num_entries);
+extern Size hash_get_shared_size(HASHCTL *info, int flags);
/*
* prototypes for functions in hashfn.c