granicus.if.org Git - postgresql/blob - src/backend/utils/hash/dynahash.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * dynahash.c
   4  *        dynamic hash tables
   5  *
   6  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
   7  * shared memory.  For shared hash tables, it is the caller's responsibility
   8  * to provide appropriate access interlocking.  The simplest convention is
   9  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
  10  * hash_seq_search) need only shared lock, but any update requires exclusive
  11  * lock.  For heavily-used shared tables, the single-lock approach creates a
  12  * concurrency bottleneck, so we also support "partitioned" locking wherein
  13  * there are multiple LWLocks guarding distinct subsets of the table.  To use
  14  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
  15  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  16  * Therefore, each hash bucket chain operates independently, and no fields
  17  * of the hash header change after init except nentries and freeList.
  18  * (A partitioned table uses multiple copies of those fields, guarded by
  19  * spinlocks, for additional concurrency.)
  20  * This lets any subset of the hash buckets be treated as a separately
  21  * lockable partition.  We expect callers to use the low-order bits of a
  22  * lookup key's hash value as a partition number --- this will work because
  23  * of the way calc_bucket() maps hash values to bucket numbers.
  24  *
  25  * For hash tables in shared memory, the memory allocator function should
  26  * match malloc's semantics of returning NULL on failure.  For hash tables
  27  * in local memory, we typically use palloc() which will throw error on
  28  * failure.  The code in this file has to cope with both cases.
  29  *
  30  * dynahash.c provides support for these types of lookup keys:
  31  *
  32  * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
  33  * compared as though by strcmp().  This is the default behavior.
  34  *
  35  * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
  36  * (Caller must ensure there are no undefined padding bits in the keys!)
  37  * This is selected by specifying HASH_BLOBS flag to hash_create.
  38  *
  39  * 3. More complex key behavior can be selected by specifying user-supplied
  40  * hashing, comparison, and/or key-copying functions.  At least a hashing
  41  * function must be supplied; comparison defaults to memcmp() and key copying
  42  * to memcpy() when a user-defined hashing function is selected.
  43  *
  44  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  45  * Portions Copyright (c) 1994, Regents of the University of California
  46  *
  47  *
  48  * IDENTIFICATION
  49  *        src/backend/utils/hash/dynahash.c
  50  *
  51  *-------------------------------------------------------------------------
  52  */
  53
  54 /*
  55  * Original comments:
  56  *
  57  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
  58  * Coded into C, with minor code improvements, and with hsearch(3) interface,
  59  * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
  60  * also, hcreate/hdestroy routines added to simulate hsearch(3).
  61  *
  62  * These routines simulate hsearch(3) and family, with the important
  63  * difference that the hash table is dynamic - can grow indefinitely
  64  * beyond its original size (as supplied to hcreate()).
  65  *
  66  * Performance appears to be comparable to that of hsearch(3).
  67  * The 'source-code' options referred to in hsearch(3)'s 'man' page
  68  * are not implemented; otherwise functionality is identical.
  69  *
  70  * Compilation controls:
  71  * HASH_DEBUG controls some informative traces, mainly for debugging.
  72  * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
  73  * when combined with HASH_DEBUG, these are displayed by hdestroy().
  74  *
  75  * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
  76  * concatenation property, in probably unnecessary code 'optimization'.
  77  *
  78  * Modified margo@postgres.berkeley.edu February 1990
  79  *              added multiple table interface
  80  * Modified by sullivan@postgres.berkeley.edu April 1990
  81  *              changed ctl structure for shared memory
  82  */
  83
  84 #include "postgres.h"
  85
  86 #include <limits.h>
  87
  88 #include "access/xact.h"
  89 #include "storage/shmem.h"
  90 #include "storage/spin.h"
  91 #include "utils/dynahash.h"
  92 #include "utils/memutils.h"
  93
  94
  95 /*
  96  * Constants
  97  *
  98  * A hash table has a top-level "directory", each of whose entries points
  99  * to a "segment" of ssize bucket headers.  The maximum number of hash
 100  * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 101  * the number of records in the table can be larger, but we don't want a
 102  * whole lot of records per bucket or performance goes down.
 103  *
 104  * In a hash table allocated in shared memory, the directory cannot be
 105  * expanded because it must stay at a fixed address.  The directory size
 106  * should be selected using hash_select_dirsize (and you'd better have
 107  * a good idea of the maximum number of entries!).  For non-shared hash
 108  * tables, the initial directory size can be left at the default.
 109  */
 110 #define DEF_SEGSIZE                        256
 111 #define DEF_SEGSIZE_SHIFT          8    /* must be log2(DEF_SEGSIZE) */
 112 #define DEF_DIRSIZE                        256
 113 #define DEF_FFACTOR                        1    /* default fill factor */
 114
 115 /* Number of freelists to be used for a partitioned hash table. */
 116 #define NUM_FREELISTS                   32
 117
 118 /* A hash bucket is a linked list of HASHELEMENTs */
 119 typedef HASHELEMENT *HASHBUCKET;
 120
 121 /* A hash segment is an array of bucket headers */
 122 typedef HASHBUCKET *HASHSEGMENT;
 123
 124 /*
 125  * Per-freelist data.
 126  *
 127  * In a partitioned hash table, each freelist is associated with a specific
 128  * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 129  * nentries tracks the number of live hashtable entries having those hashcodes
 130  * (NOT the number of entries in the freelist, as you might expect).
 131  *
 132  * The coverage of a freelist might be more or less than one partition, so it
 133  * needs its own lock rather than relying on caller locking.  Relying on that
 134  * wouldn't work even if the coverage was the same, because of the occasional
 135  * need to "borrow" entries from another freelist; see get_hash_entry().
 136  *
 137  * Using an array of FreeListData instead of separate arrays of mutexes,
 138  * nentries and freeLists helps to reduce sharing of cache lines between
 139  * different mutexes.
 140  */
 141 typedef struct
 142 {
 143         slock_t         mutex;                  /* spinlock for this freelist */
 144         long            nentries;               /* number of entries in associated buckets */
 145         HASHELEMENT *freeList;          /* chain of free elements */
 146 } FreeListData;
 147
 148 /*
 149  * Header structure for a hash table --- contains all changeable info
 150  *
 151  * In a shared-memory hash table, the HASHHDR is in shared memory, while
 152  * each backend has a local HTAB struct.  For a non-shared table, there isn't
 153  * any functional difference between HASHHDR and HTAB, but we separate them
 154  * anyway to share code between shared and non-shared tables.
 155  */
 156 struct HASHHDR
 157 {
 158         /*
 159          * The freelist can become a point of contention in high-concurrency hash
 160          * tables, so we use an array of freelists, each with its own mutex and
 161          * nentries count, instead of just a single one.  Although the freelists
 162          * normally operate independently, we will scavenge entries from freelists
 163          * other than a hashcode's default freelist when necessary.
 164          *
 165          * If the hash table is not partitioned, only freeList[0] is used and its
 166          * spinlock is not used at all; callers' locking is assumed sufficient.
 167          */
 168         FreeListData freeList[NUM_FREELISTS];
 169
 170         /* These fields can change, but not in a partitioned table */
 171         /* Also, dsize can't change in a shared table, even if unpartitioned */
 172         long            dsize;                  /* directory size */
 173         long            nsegs;                  /* number of allocated segments (<= dsize) */
 174         uint32          max_bucket;             /* ID of maximum bucket in use */
 175         uint32          high_mask;              /* mask to modulo into entire table */
 176         uint32          low_mask;               /* mask to modulo into lower half of table */
 177
 178         /* These fields are fixed at hashtable creation */
 179         Size            keysize;                /* hash key length in bytes */
 180         Size            entrysize;              /* total user element size in bytes */
 181         long            num_partitions; /* # partitions (must be power of 2), or 0 */
 182         long            ffactor;                /* target fill factor */
 183         long            max_dsize;              /* 'dsize' limit if directory is fixed size */
 184         long            ssize;                  /* segment size --- must be power of 2 */
 185         int                     sshift;                 /* segment shift = log2(ssize) */
 186         int                     nelem_alloc;    /* number of entries to allocate at once */
 187
 188 #ifdef HASH_STATISTICS
 189
 190         /*
 191          * Count statistics here.  NB: stats code doesn't bother with mutex, so
 192          * counts could be corrupted a bit in a partitioned table.
 193          */
 194         long            accesses;
 195         long            collisions;
 196 #endif
 197 };
 198
 199 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 200
 201 #define FREELIST_IDX(hctl, hashcode) \
 202         (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
 203
 204 /*
 205  * Top control structure for a hashtable --- in a shared table, each backend
 206  * has its own copy (OK since no fields change at runtime)
 207  */
 208 struct HTAB
 209 {
 210         HASHHDR    *hctl;                       /* => shared control information */
 211         HASHSEGMENT *dir;                       /* directory of segment starts */
 212         HashValueFunc hash;                     /* hash function */
 213         HashCompareFunc match;          /* key comparison function */
 214         HashCopyFunc keycopy;           /* key copying function */
 215         HashAllocFunc alloc;            /* memory allocator */
 216         MemoryContext hcxt;                     /* memory context if default allocator used */
 217         char       *tabname;            /* table name (for error messages) */
 218         bool            isshared;               /* true if table is in shared memory */
 219         bool            isfixed;                /* if true, don't enlarge */
 220
 221         /* freezing a shared table isn't allowed, so we can keep state here */
 222         bool            frozen;                 /* true = no more inserts allowed */
 223
 224         /* We keep local copies of these fixed values to reduce contention */
 225         Size            keysize;                /* hash key length in bytes */
 226         long            ssize;                  /* segment size --- must be power of 2 */
 227         int                     sshift;                 /* segment shift = log2(ssize) */
 228 };
 229
 230 /*
 231  * Key (also entry) part of a HASHELEMENT
 232  */
 233 #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
 234
 235 /*
 236  * Obtain element pointer given pointer to key
 237  */
 238 #define ELEMENT_FROM_KEY(key)  \
 239         ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
 240
 241 /*
 242  * Fast MOD arithmetic, assuming that y is a power of 2 !
 243  */
 244 #define MOD(x,y)                           ((x) & ((y)-1))
 245
 246 #if HASH_STATISTICS
 247 static long hash_accesses,
 248                         hash_collisions,
 249                         hash_expansions;
 250 #endif
 251
 252 /*
 253  * Private function prototypes
 254  */
 255 static void *DynaHashAlloc(Size size);
 256 static HASHSEGMENT seg_alloc(HTAB *hashp);
 257 static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 258 static bool dir_realloc(HTAB *hashp);
 259 static bool expand_table(HTAB *hashp);
 260 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 261 static void hdefault(HTAB *hashp);
 262 static int      choose_nelem_alloc(Size entrysize);
 263 static bool init_htab(HTAB *hashp, long nelem);
 264 static void hash_corrupted(HTAB *hashp);
 265 static long next_pow2_long(long num);
 266 static int      next_pow2_int(long num);
 267 static void register_seq_scan(HTAB *hashp);
 268 static void deregister_seq_scan(HTAB *hashp);
 269 static bool has_seq_scans(HTAB *hashp);
 270
 271
 272 /*
 273  * memory allocation support
 274  */
 275 static MemoryContext CurrentDynaHashCxt = NULL;
 276
 277 static void *
 278 DynaHashAlloc(Size size)
 279 {
 280         Assert(MemoryContextIsValid(CurrentDynaHashCxt));
 281         return MemoryContextAlloc(CurrentDynaHashCxt, size);
 282 }
 283
 284
 285 /*
 286  * HashCompareFunc for string keys
 287  *
 288  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 289  * bytes, so we can only compare that many ... hence strncmp is almost but
 290  * not quite the right thing.
 291  */
 292 static int
 293 string_compare(const char *key1, const char *key2, Size keysize)
 294 {
 295         return strncmp(key1, key2, keysize - 1);
 296 }
 297
 298
 299 /************************** CREATE ROUTINES **********************/
 300
 301 /*
 302  * hash_create -- create a new dynamic hash table
 303  *
 304  *      tabname: a name for the table (for debugging purposes)
 305  *      nelem: maximum number of elements expected
 306  *      *info: additional table parameters, as indicated by flags
 307  *      flags: bitmask indicating which parameters to take from *info
 308  *
 309  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 310  * estimate, since we can't expand the table on the fly.  But an unshared
 311  * hashtable can be expanded on-the-fly, so it's better for nelem to be
 312  * on the small side and let the table grow if it's exceeded.  An overly
 313  * large nelem will penalize hash_seq_search speed without buying much.
 314  */
 315 HTAB *
 316 hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 317 {
 318         HTAB       *hashp;
 319         HASHHDR    *hctl;
 320
 321         /*
 322          * For shared hash tables, we have a local hash header (HTAB struct) that
 323          * we allocate in TopMemoryContext; all else is in shared memory.
 324          *
 325          * For non-shared hash tables, everything including the hash header is in
 326          * a memory context created specially for the hash table --- this makes
 327          * hash_destroy very simple.  The memory context is made a child of either
 328          * a context specified by the caller, or TopMemoryContext if nothing is
 329          * specified.
 330          */
 331         if (flags & HASH_SHARED_MEM)
 332         {
 333                 /* Set up to allocate the hash header */
 334                 CurrentDynaHashCxt = TopMemoryContext;
 335         }
 336         else
 337         {
 338                 /* Create the hash table's private memory context */
 339                 if (flags & HASH_CONTEXT)
 340                         CurrentDynaHashCxt = info->hcxt;
 341                 else
 342                         CurrentDynaHashCxt = TopMemoryContext;
 343                 CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
 344                                                                                                    "dynahash",
 345                                                                                                    ALLOCSET_DEFAULT_SIZES);
 346         }
 347
 348         /* Initialize the hash header, plus a copy of the table name */
 349         hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
 350         MemSet(hashp, 0, sizeof(HTAB));
 351
 352         hashp->tabname = (char *) (hashp + 1);
 353         strcpy(hashp->tabname, tabname);
 354
 355         /* If we have a private context, label it with hashtable's name */
 356         if (!(flags & HASH_SHARED_MEM))
 357                 MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);
 358
 359         /*
 360          * Select the appropriate hash function (see comments at head of file).
 361          */
 362         if (flags & HASH_FUNCTION)
 363                 hashp->hash = info->hash;
 364         else if (flags & HASH_BLOBS)
 365         {
 366                 /* We can optimize hashing for common key sizes */
 367                 Assert(flags & HASH_ELEM);
 368                 if (info->keysize == sizeof(uint32))
 369                         hashp->hash = uint32_hash;
 370                 else
 371                         hashp->hash = tag_hash;
 372         }
 373         else
 374                 hashp->hash = string_hash;      /* default hash function */
 375
 376         /*
 377          * If you don't specify a match function, it defaults to string_compare if
 378          * you used string_hash (either explicitly or by default) and to memcmp
 379          * otherwise.
 380          *
 381          * Note: explicitly specifying string_hash is deprecated, because this
 382          * might not work for callers in loadable modules on some platforms due to
 383          * referencing a trampoline instead of the string_hash function proper.
 384          * Just let it default, eh?
 385          */
 386         if (flags & HASH_COMPARE)
 387                 hashp->match = info->match;
 388         else if (hashp->hash == string_hash)
 389                 hashp->match = (HashCompareFunc) string_compare;
 390         else
 391                 hashp->match = memcmp;
 392
 393         /*
 394          * Similarly, the key-copying function defaults to strlcpy or memcpy.
 395          */
 396         if (flags & HASH_KEYCOPY)
 397                 hashp->keycopy = info->keycopy;
 398         else if (hashp->hash == string_hash)
 399                 hashp->keycopy = (HashCopyFunc) strlcpy;
 400         else
 401                 hashp->keycopy = memcpy;
 402
 403         /* And select the entry allocation function, too. */
 404         if (flags & HASH_ALLOC)
 405                 hashp->alloc = info->alloc;
 406         else
 407                 hashp->alloc = DynaHashAlloc;
 408
 409         if (flags & HASH_SHARED_MEM)
 410         {
 411                 /*
 412                  * ctl structure and directory are preallocated for shared memory
 413                  * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
 414                  * well.
 415                  */
 416                 hashp->hctl = info->hctl;
 417                 hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
 418                 hashp->hcxt = NULL;
 419                 hashp->isshared = true;
 420
 421                 /* hash table already exists, we're just attaching to it */
 422                 if (flags & HASH_ATTACH)
 423                 {
 424                         /* make local copies of some heavily-used values */
 425                         hctl = hashp->hctl;
 426                         hashp->keysize = hctl->keysize;
 427                         hashp->ssize = hctl->ssize;
 428                         hashp->sshift = hctl->sshift;
 429
 430                         return hashp;
 431                 }
 432         }
 433         else
 434         {
 435                 /* setup hash table defaults */
 436                 hashp->hctl = NULL;
 437                 hashp->dir = NULL;
 438                 hashp->hcxt = CurrentDynaHashCxt;
 439                 hashp->isshared = false;
 440         }
 441
 442         if (!hashp->hctl)
 443         {
 444                 hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
 445                 if (!hashp->hctl)
 446                         ereport(ERROR,
 447                                         (errcode(ERRCODE_OUT_OF_MEMORY),
 448                                          errmsg("out of memory")));
 449         }
 450
 451         hashp->frozen = false;
 452
 453         hdefault(hashp);
 454
 455         hctl = hashp->hctl;
 456
 457         if (flags & HASH_PARTITION)
 458         {
 459                 /* Doesn't make sense to partition a local hash table */
 460                 Assert(flags & HASH_SHARED_MEM);
 461
 462                 /*
 463                  * The number of partitions had better be a power of 2. Also, it must
 464                  * be less than INT_MAX (see init_htab()), so call the int version of
 465                  * next_pow2.
 466                  */
 467                 Assert(info->num_partitions == next_pow2_int(info->num_partitions));
 468
 469                 hctl->num_partitions = info->num_partitions;
 470         }
 471
 472         if (flags & HASH_SEGMENT)
 473         {
 474                 hctl->ssize = info->ssize;
 475                 hctl->sshift = my_log2(info->ssize);
 476                 /* ssize had better be a power of 2 */
 477                 Assert(hctl->ssize == (1L << hctl->sshift));
 478         }
 479         if (flags & HASH_FFACTOR)
 480                 hctl->ffactor = info->ffactor;
 481
 482         /*
 483          * SHM hash tables have fixed directory size passed by the caller.
 484          */
 485         if (flags & HASH_DIRSIZE)
 486         {
 487                 hctl->max_dsize = info->max_dsize;
 488                 hctl->dsize = info->dsize;
 489         }
 490
 491         /*
 492          * hash table now allocates space for key and data but you have to say how
 493          * much space to allocate
 494          */
 495         if (flags & HASH_ELEM)
 496         {
 497                 Assert(info->entrysize >= info->keysize);
 498                 hctl->keysize = info->keysize;
 499                 hctl->entrysize = info->entrysize;
 500         }
 501
 502         /* make local copies of heavily-used constant fields */
 503         hashp->keysize = hctl->keysize;
 504         hashp->ssize = hctl->ssize;
 505         hashp->sshift = hctl->sshift;
 506
 507         /* Build the hash directory structure */
 508         if (!init_htab(hashp, nelem))
 509                 elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
 510
 511         /*
 512          * For a shared hash table, preallocate the requested number of elements.
 513          * This reduces problems with run-time out-of-shared-memory conditions.
 514          *
 515          * For a non-shared hash table, preallocate the requested number of
 516          * elements if it's less than our chosen nelem_alloc.  This avoids wasting
 517          * space if the caller correctly estimates a small table size.
 518          */
 519         if ((flags & HASH_SHARED_MEM) ||
 520                 nelem < hctl->nelem_alloc)
 521         {
 522                 int                     i,
 523                                         freelist_partitions,
 524                                         nelem_alloc,
 525                                         nelem_alloc_first;
 526
 527                 /*
 528                  * If hash table is partitioned, give each freelist an equal share of
 529                  * the initial allocation.  Otherwise only freeList[0] is used.
 530                  */
 531                 if (IS_PARTITIONED(hashp->hctl))
 532                         freelist_partitions = NUM_FREELISTS;
 533                 else
 534                         freelist_partitions = 1;
 535
 536                 nelem_alloc = nelem / freelist_partitions;
 537                 if (nelem_alloc <= 0)
 538                         nelem_alloc = 1;
 539
 540                 /*
 541                  * Make sure we'll allocate all the requested elements; freeList[0]
 542                  * gets the excess if the request isn't divisible by NUM_FREELISTS.
 543                  */
 544                 if (nelem_alloc * freelist_partitions < nelem)
 545                         nelem_alloc_first =
 546                                 nelem - nelem_alloc * (freelist_partitions - 1);
 547                 else
 548                         nelem_alloc_first = nelem_alloc;
 549
 550                 for (i = 0; i < freelist_partitions; i++)
 551                 {
 552                         int                     temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
 553
 554                         if (!element_alloc(hashp, temp, i))
 555                                 ereport(ERROR,
 556                                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 557                                                  errmsg("out of memory")));
 558                 }
 559         }
 560
 561         if (flags & HASH_FIXED_SIZE)
 562                 hashp->isfixed = true;
 563         return hashp;
 564 }
 565
 566 /*
 567  * Set default HASHHDR parameters.
 568  */
 569 static void
 570 hdefault(HTAB *hashp)
 571 {
 572         HASHHDR    *hctl = hashp->hctl;
 573
 574         MemSet(hctl, 0, sizeof(HASHHDR));
 575
 576         hctl->dsize = DEF_DIRSIZE;
 577         hctl->nsegs = 0;
 578
 579         /* rather pointless defaults for key & entry size */
 580         hctl->keysize = sizeof(char *);
 581         hctl->entrysize = 2 * sizeof(char *);
 582
 583         hctl->num_partitions = 0;       /* not partitioned */
 584
 585         hctl->ffactor = DEF_FFACTOR;
 586
 587         /* table has no fixed maximum size */
 588         hctl->max_dsize = NO_MAX_DSIZE;
 589
 590         hctl->ssize = DEF_SEGSIZE;
 591         hctl->sshift = DEF_SEGSIZE_SHIFT;
 592
 593 #ifdef HASH_STATISTICS
 594         hctl->accesses = hctl->collisions = 0;
 595 #endif
 596 }
 597
 598 /*
 599  * Given the user-specified entry size, choose nelem_alloc, ie, how many
 600  * elements to add to the hash table when we need more.
 601  */
 602 static int
 603 choose_nelem_alloc(Size entrysize)
 604 {
 605         int                     nelem_alloc;
 606         Size            elementSize;
 607         Size            allocSize;
 608
 609         /* Each element has a HASHELEMENT header plus user data. */
 610         /* NB: this had better match element_alloc() */
 611         elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
 612
 613         /*
 614          * The idea here is to choose nelem_alloc at least 32, but round up so
 615          * that the allocation request will be a power of 2 or just less. This
 616          * makes little difference for hash tables in shared memory, but for hash
 617          * tables managed by palloc, the allocation request will be rounded up to
 618          * a power of 2 anyway.  If we fail to take this into account, we'll waste
 619          * as much as half the allocated space.
 620          */
 621         allocSize = 32 * 4;                     /* assume elementSize at least 8 */
 622         do
 623         {
 624                 allocSize <<= 1;
 625                 nelem_alloc = allocSize / elementSize;
 626         } while (nelem_alloc < 32);
 627
 628         return nelem_alloc;
 629 }
 630
 631 /*
 632  * Compute derived fields of hctl and build the initial directory/segment
 633  * arrays
 634  */
 635 static bool
 636 init_htab(HTAB *hashp, long nelem)
 637 {
 638         HASHHDR    *hctl = hashp->hctl;
 639         HASHSEGMENT *segp;
 640         int                     nbuckets;
 641         int                     nsegs;
 642         int                     i;
 643
 644         /*
 645          * initialize mutexes if it's a partitioned table
 646          */
 647         if (IS_PARTITIONED(hctl))
 648                 for (i = 0; i < NUM_FREELISTS; i++)
 649                         SpinLockInit(&(hctl->freeList[i].mutex));
 650
 651         /*
 652          * Divide number of elements by the fill factor to determine a desired
 653          * number of buckets.  Allocate space for the next greater power of two
 654          * number of buckets
 655          */
 656         nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);
 657
 658         /*
 659          * In a partitioned table, nbuckets must be at least equal to
 660          * num_partitions; were it less, keys with apparently different partition
 661          * numbers would map to the same bucket, breaking partition independence.
 662          * (Normally nbuckets will be much bigger; this is just a safety check.)
 663          */
 664         while (nbuckets < hctl->num_partitions)
 665                 nbuckets <<= 1;
 666
 667         hctl->max_bucket = hctl->low_mask = nbuckets - 1;
 668         hctl->high_mask = (nbuckets << 1) - 1;
 669
 670         /*
 671          * Figure number of directory segments needed, round up to a power of 2
 672          */
 673         nsegs = (nbuckets - 1) / hctl->ssize + 1;
 674         nsegs = next_pow2_int(nsegs);
 675
 676         /*
 677          * Make sure directory is big enough. If pre-allocated directory is too
 678          * small, choke (caller screwed up).
 679          */
 680         if (nsegs > hctl->dsize)
 681         {
 682                 if (!(hashp->dir))
 683                         hctl->dsize = nsegs;
 684                 else
 685                         return false;
 686         }
 687
 688         /* Allocate a directory */
 689         if (!(hashp->dir))
 690         {
 691                 CurrentDynaHashCxt = hashp->hcxt;
 692                 hashp->dir = (HASHSEGMENT *)
 693                         hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
 694                 if (!hashp->dir)
 695                         return false;
 696         }
 697
 698         /* Allocate initial segments */
 699         for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
 700         {
 701                 *segp = seg_alloc(hashp);
 702                 if (*segp == NULL)
 703                         return false;
 704         }
 705
 706         /* Choose number of entries to allocate at a time */
 707         hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
 708
 709 #if HASH_DEBUG
 710         fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
 711                         "TABLE POINTER   ", hashp,
 712                         "DIRECTORY SIZE  ", hctl->dsize,
 713                         "SEGMENT SIZE    ", hctl->ssize,
 714                         "SEGMENT SHIFT   ", hctl->sshift,
 715                         "FILL FACTOR     ", hctl->ffactor,
 716                         "MAX BUCKET      ", hctl->max_bucket,
 717                         "HIGH MASK       ", hctl->high_mask,
 718                         "LOW  MASK       ", hctl->low_mask,
 719                         "NSEGS           ", hctl->nsegs);
 720 #endif
 721         return true;
 722 }
 723
 724 /*
 725  * Estimate the space needed for a hashtable containing the given number
 726  * of entries of given size.
 727  * NOTE: this is used to estimate the footprint of hashtables in shared
 728  * memory; therefore it does not count HTAB which is in local memory.
 729  * NB: assumes that all hash structure parameters have default values!
 730  */
 731 Size
 732 hash_estimate_size(long num_entries, Size entrysize)
 733 {
 734         Size            size;
 735         long            nBuckets,
 736                                 nSegments,
 737                                 nDirEntries,
 738                                 nElementAllocs,
 739                                 elementSize,
 740                                 elementAllocCnt;
 741
 742         /* estimate number of buckets wanted */
 743         nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
 744         /* # of segments needed for nBuckets */
 745         nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
 746         /* directory entries */
 747         nDirEntries = DEF_DIRSIZE;
 748         while (nDirEntries < nSegments)
 749                 nDirEntries <<= 1;              /* dir_alloc doubles dsize at each call */
 750
 751         /* fixed control info */
 752         size = MAXALIGN(sizeof(HASHHDR));       /* but not HTAB, per above */
 753         /* directory */
 754         size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
 755         /* segments */
 756         size = add_size(size, mul_size(nSegments,
 757                                                                    MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
 758         /* elements --- allocated in groups of choose_nelem_alloc() entries */
 759         elementAllocCnt = choose_nelem_alloc(entrysize);
 760         nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
 761         elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
 762         size = add_size(size,
 763                                         mul_size(nElementAllocs,
 764                                                          mul_size(elementAllocCnt, elementSize)));
 765
 766         return size;
 767 }
 768
 769 /*
 770  * Select an appropriate directory size for a hashtable with the given
 771  * maximum number of entries.
 772  * This is only needed for hashtables in shared memory, whose directories
 773  * cannot be expanded dynamically.
 774  * NB: assumes that all hash structure parameters have default values!
 775  *
 776  * XXX this had better agree with the behavior of init_htab()...
 777  */
 778 long
 779 hash_select_dirsize(long num_entries)
 780 {
 781         long            nBuckets,
 782                                 nSegments,
 783                                 nDirEntries;
 784
 785         /* estimate number of buckets wanted */
 786         nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
 787         /* # of segments needed for nBuckets */
 788         nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
 789         /* directory entries */
 790         nDirEntries = DEF_DIRSIZE;
 791         while (nDirEntries < nSegments)
 792                 nDirEntries <<= 1;              /* dir_alloc doubles dsize at each call */
 793
 794         return nDirEntries;
 795 }
 796
 797 /*
 798  * Compute the required initial memory allocation for a shared-memory
 799  * hashtable with the given parameters.  We need space for the HASHHDR
 800  * and for the (non expansible) directory.
 801  */
 802 Size
 803 hash_get_shared_size(HASHCTL *info, int flags)
 804 {
 805         Assert(flags & HASH_DIRSIZE);
 806         Assert(info->dsize == info->max_dsize);
 807         return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
 808 }
 809
 810
 811 /********************** DESTROY ROUTINES ************************/
 812
 813 void
 814 hash_destroy(HTAB *hashp)
 815 {
 816         if (hashp != NULL)
 817         {
 818                 /* allocation method must be one we know how to free, too */
 819                 Assert(hashp->alloc == DynaHashAlloc);
 820                 /* so this hashtable must have its own context */
 821                 Assert(hashp->hcxt != NULL);
 822
 823                 hash_stats("destroy", hashp);
 824
 825                 /*
 826                  * Free everything by destroying the hash table's memory context.
 827                  */
 828                 MemoryContextDelete(hashp->hcxt);
 829         }
 830 }
 831
 832 void
 833 hash_stats(const char *where, HTAB *hashp)
 834 {
 835 #if HASH_STATISTICS
 836         fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
 837                         where, hashp->hctl->accesses, hashp->hctl->collisions);
 838
 839         fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
 840                         hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
 841                         hashp->hctl->max_bucket, hashp->hctl->nsegs);
 842         fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 843                         where, hash_accesses, hash_collisions);
 844         fprintf(stderr, "hash_stats: total expansions %ld\n",
 845                         hash_expansions);
 846 #endif
 847 }
 848
 849 /*******************************SEARCH ROUTINES *****************************/
 850
 851
 852 /*
 853  * get_hash_value -- exported routine to calculate a key's hash value
 854  *
 855  * We export this because for partitioned tables, callers need to compute
 856  * the partition number (from the low-order bits of the hash value) before
 857  * searching.
 858  */
 859 uint32
 860 get_hash_value(HTAB *hashp, const void *keyPtr)
 861 {
 862         return hashp->hash(keyPtr, hashp->keysize);
 863 }
 864
 865 /* Convert a hash value to a bucket number */
 866 static inline uint32
 867 calc_bucket(HASHHDR *hctl, uint32 hash_val)
 868 {
 869         uint32          bucket;
 870
 871         bucket = hash_val & hctl->high_mask;
 872         if (bucket > hctl->max_bucket)
 873                 bucket = bucket & hctl->low_mask;
 874
 875         return bucket;
 876 }
 877
 878 /*
 879  * hash_search -- look up key in table and perform action
 880  * hash_search_with_hash_value -- same, with key's hash value already computed
 881  *
 882  * action is one of:
 883  *              HASH_FIND: look up key in table
 884  *              HASH_ENTER: look up key in table, creating entry if not present
 885  *              HASH_ENTER_NULL: same, but return NULL if out of memory
 886  *              HASH_REMOVE: look up key in table, remove entry if present
 887  *
 888  * Return value is a pointer to the element found/entered/removed if any,
 889  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 890  * the result is a dangling pointer that shouldn't be dereferenced!)
 891  *
 892  * HASH_ENTER will normally ereport a generic "out of memory" error if
 893  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 894  * the same except it will return NULL if out of memory.  Note that
 895  * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 896  * since palloc internally ereports on out-of-memory.
 897  *
 898  * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 899  * existing entry in the table, false otherwise.  This is needed in the
 900  * HASH_ENTER case, but is redundant with the return value otherwise.
 901  *
 902  * For hash_search_with_hash_value, the hashvalue parameter must have been
 903  * calculated with get_hash_value().
 904  */
 905 void *
 906 hash_search(HTAB *hashp,
 907                         const void *keyPtr,
 908                         HASHACTION action,
 909                         bool *foundPtr)
 910 {
 911         return hash_search_with_hash_value(hashp,
 912                                                                            keyPtr,
 913                                                                            hashp->hash(keyPtr, hashp->keysize),
 914                                                                            action,
 915                                                                            foundPtr);
 916 }
 917
 918 void *
 919 hash_search_with_hash_value(HTAB *hashp,
 920                                                         const void *keyPtr,
 921                                                         uint32 hashvalue,
 922                                                         HASHACTION action,
 923                                                         bool *foundPtr)
 924 {
 925         HASHHDR    *hctl = hashp->hctl;
 926         int                     freelist_idx = FREELIST_IDX(hctl, hashvalue);
 927         Size            keysize;
 928         uint32          bucket;
 929         long            segment_num;
 930         long            segment_ndx;
 931         HASHSEGMENT segp;
 932         HASHBUCKET      currBucket;
 933         HASHBUCKET *prevBucketPtr;
 934         HashCompareFunc match;
 935
 936 #if HASH_STATISTICS
 937         hash_accesses++;
 938         hctl->accesses++;
 939 #endif
 940
 941         /*
 942          * If inserting, check if it is time to split a bucket.
 943          *
 944          * NOTE: failure to expand table is not a fatal error, it just means we
 945          * have to run at higher fill factor than we wanted.  However, if we're
 946          * using the palloc allocator then it will throw error anyway on
 947          * out-of-memory, so we must do this before modifying the table.
 948          */
 949         if (action == HASH_ENTER || action == HASH_ENTER_NULL)
 950         {
 951                 /*
 952                  * Can't split if running in partitioned mode, nor if frozen, nor if
 953                  * table is the subject of any active hash_seq_search scans.  Strange
 954                  * order of these tests is to try to check cheaper conditions first.
 955                  */
 956                 if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
 957                         hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
 958                         !has_seq_scans(hashp))
 959                         (void) expand_table(hashp);
 960         }
 961
 962         /*
 963          * Do the initial lookup
 964          */
 965         bucket = calc_bucket(hctl, hashvalue);
 966
 967         segment_num = bucket >> hashp->sshift;
 968         segment_ndx = MOD(bucket, hashp->ssize);
 969
 970         segp = hashp->dir[segment_num];
 971
 972         if (segp == NULL)
 973                 hash_corrupted(hashp);
 974
 975         prevBucketPtr = &segp[segment_ndx];
 976         currBucket = *prevBucketPtr;
 977
 978         /*
 979          * Follow collision chain looking for matching key
 980          */
 981         match = hashp->match;           /* save one fetch in inner loop */
 982         keysize = hashp->keysize;       /* ditto */
 983
 984         while (currBucket != NULL)
 985         {
 986                 if (currBucket->hashvalue == hashvalue &&
 987                         match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
 988                         break;
 989                 prevBucketPtr = &(currBucket->link);
 990                 currBucket = *prevBucketPtr;
 991 #if HASH_STATISTICS
 992                 hash_collisions++;
 993                 hctl->collisions++;
 994 #endif
 995         }
 996
 997         if (foundPtr)
 998                 *foundPtr = (bool) (currBucket != NULL);
 999
1000         /*
1001          * OK, now what?
1002          */
1003         switch (action)
1004         {
1005                 case HASH_FIND:
1006                         if (currBucket != NULL)
1007                                 return (void *) ELEMENTKEY(currBucket);
1008                         return NULL;
1009
1010                 case HASH_REMOVE:
1011                         if (currBucket != NULL)
1012                         {
1013                                 /* if partitioned, must lock to touch nentries and freeList */
1014                                 if (IS_PARTITIONED(hctl))
1015                                         SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
1016
1017                                 /* delete the record from the appropriate nentries counter. */
1018                                 Assert(hctl->freeList[freelist_idx].nentries > 0);
1019                                 hctl->freeList[freelist_idx].nentries--;
1020
1021                                 /* remove record from hash bucket's chain. */
1022                                 *prevBucketPtr = currBucket->link;
1023
1024                                 /* add the record to the appropriate freelist. */
1025                                 currBucket->link = hctl->freeList[freelist_idx].freeList;
1026                                 hctl->freeList[freelist_idx].freeList = currBucket;
1027
1028                                 if (IS_PARTITIONED(hctl))
1029                                         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1030
1031                                 /*
1032                                  * better hope the caller is synchronizing access to this
1033                                  * element, because someone else is going to reuse it the next
1034                                  * time something is added to the table
1035                                  */
1036                                 return (void *) ELEMENTKEY(currBucket);
1037                         }
1038                         return NULL;
1039
1040                 case HASH_ENTER_NULL:
1041                         /* ENTER_NULL does not work with palloc-based allocator */
1042                         Assert(hashp->alloc != DynaHashAlloc);
1043                         /* FALL THRU */
1044
1045                 case HASH_ENTER:
1046                         /* Return existing element if found, else create one */
1047                         if (currBucket != NULL)
1048                                 return (void *) ELEMENTKEY(currBucket);
1049
1050                         /* disallow inserts if frozen */
1051                         if (hashp->frozen)
1052                                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1053                                          hashp->tabname);
1054
1055                         currBucket = get_hash_entry(hashp, freelist_idx);
1056                         if (currBucket == NULL)
1057                         {
1058                                 /* out of memory */
1059                                 if (action == HASH_ENTER_NULL)
1060                                         return NULL;
1061                                 /* report a generic message */
1062                                 if (hashp->isshared)
1063                                         ereport(ERROR,
1064                                                         (errcode(ERRCODE_OUT_OF_MEMORY),
1065                                                          errmsg("out of shared memory")));
1066                                 else
1067                                         ereport(ERROR,
1068                                                         (errcode(ERRCODE_OUT_OF_MEMORY),
1069                                                          errmsg("out of memory")));
1070                         }
1071
1072                         /* link into hashbucket chain */
1073                         *prevBucketPtr = currBucket;
1074                         currBucket->link = NULL;
1075
1076                         /* copy key into record */
1077                         currBucket->hashvalue = hashvalue;
1078                         hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
1079
1080                         /*
1081                          * Caller is expected to fill the data field on return.  DO NOT
1082                          * insert any code that could possibly throw error here, as doing
1083                          * so would leave the table entry incomplete and hence corrupt the
1084                          * caller's data structure.
1085                          */
1086
1087                         return (void *) ELEMENTKEY(currBucket);
1088         }
1089
1090         elog(ERROR, "unrecognized hash action code: %d", (int) action);
1091
1092         return NULL;                            /* keep compiler quiet */
1093 }
1094
1095 /*
1096  * hash_update_hash_key -- change the hash key of an existing table entry
1097  *
1098  * This is equivalent to removing the entry, making a new entry, and copying
1099  * over its data, except that the entry never goes to the table's freelist.
1100  * Therefore this cannot suffer an out-of-memory failure, even if there are
1101  * other processes operating in other partitions of the hashtable.
1102  *
1103  * Returns true if successful, false if the requested new hash key is already
1104  * present.  Throws error if the specified entry pointer isn't actually a
1105  * table member.
1106  *
1107  * NB: currently, there is no special case for old and new hash keys being
1108  * identical, which means we'll report false for that situation.  This is
1109  * preferable for existing uses.
1110  *
1111  * NB: for a partitioned hashtable, caller must hold lock on both relevant
1112  * partitions, if the new hash key would belong to a different partition.
1113  */
1114 bool
1115 hash_update_hash_key(HTAB *hashp,
1116                                          void *existingEntry,
1117                                          const void *newKeyPtr)
1118 {
1119         HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
1120         HASHHDR    *hctl = hashp->hctl;
1121         uint32          newhashvalue;
1122         Size            keysize;
1123         uint32          bucket;
1124         uint32          newbucket;
1125         long            segment_num;
1126         long            segment_ndx;
1127         HASHSEGMENT segp;
1128         HASHBUCKET      currBucket;
1129         HASHBUCKET *prevBucketPtr;
1130         HASHBUCKET *oldPrevPtr;
1131         HashCompareFunc match;
1132
1133 #if HASH_STATISTICS
1134         hash_accesses++;
1135         hctl->accesses++;
1136 #endif
1137
1138         /* disallow updates if frozen */
1139         if (hashp->frozen)
1140                 elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1141                          hashp->tabname);
1142
1143         /*
1144          * Lookup the existing element using its saved hash value.  We need to do
1145          * this to be able to unlink it from its hash chain, but as a side benefit
1146          * we can verify the validity of the passed existingEntry pointer.
1147          */
1148         bucket = calc_bucket(hctl, existingElement->hashvalue);
1149
1150         segment_num = bucket >> hashp->sshift;
1151         segment_ndx = MOD(bucket, hashp->ssize);
1152
1153         segp = hashp->dir[segment_num];
1154
1155         if (segp == NULL)
1156                 hash_corrupted(hashp);
1157
1158         prevBucketPtr = &segp[segment_ndx];
1159         currBucket = *prevBucketPtr;
1160
1161         while (currBucket != NULL)
1162         {
1163                 if (currBucket == existingElement)
1164                         break;
1165                 prevBucketPtr = &(currBucket->link);
1166                 currBucket = *prevBucketPtr;
1167         }
1168
1169         if (currBucket == NULL)
1170                 elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1171                          hashp->tabname);
1172
1173         oldPrevPtr = prevBucketPtr;
1174
1175         /*
1176          * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1177          * chain we want to put the entry into.
1178          */
1179         newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
1180
1181         newbucket = calc_bucket(hctl, newhashvalue);
1182
1183         segment_num = newbucket >> hashp->sshift;
1184         segment_ndx = MOD(newbucket, hashp->ssize);
1185
1186         segp = hashp->dir[segment_num];
1187
1188         if (segp == NULL)
1189                 hash_corrupted(hashp);
1190
1191         prevBucketPtr = &segp[segment_ndx];
1192         currBucket = *prevBucketPtr;
1193
1194         /*
1195          * Follow collision chain looking for matching key
1196          */
1197         match = hashp->match;           /* save one fetch in inner loop */
1198         keysize = hashp->keysize;       /* ditto */
1199
1200         while (currBucket != NULL)
1201         {
1202                 if (currBucket->hashvalue == newhashvalue &&
1203                         match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
1204                         break;
1205                 prevBucketPtr = &(currBucket->link);
1206                 currBucket = *prevBucketPtr;
1207 #if HASH_STATISTICS
1208                 hash_collisions++;
1209                 hctl->collisions++;
1210 #endif
1211         }
1212
1213         if (currBucket != NULL)
1214                 return false;                   /* collision with an existing entry */
1215
1216         currBucket = existingElement;
1217
1218         /*
1219          * If old and new hash values belong to the same bucket, we need not
1220          * change any chain links, and indeed should not since this simplistic
1221          * update will corrupt the list if currBucket is the last element.  (We
1222          * cannot fall out earlier, however, since we need to scan the bucket to
1223          * check for duplicate keys.)
1224          */
1225         if (bucket != newbucket)
1226         {
1227                 /* OK to remove record from old hash bucket's chain. */
1228                 *oldPrevPtr = currBucket->link;
1229
1230                 /* link into new hashbucket chain */
1231                 *prevBucketPtr = currBucket;
1232                 currBucket->link = NULL;
1233         }
1234
1235         /* copy new key into record */
1236         currBucket->hashvalue = newhashvalue;
1237         hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
1238
1239         /* rest of record is untouched */
1240
1241         return true;
1242 }
1243
1244 /*
1245  * Allocate a new hashtable entry if possible; return NULL if out of memory.
1246  * (Or, if the underlying space allocator throws error for out-of-memory,
1247  * we won't return at all.)
1248  */
1249 static HASHBUCKET
1250 get_hash_entry(HTAB *hashp, int freelist_idx)
1251 {
1252         HASHHDR    *hctl = hashp->hctl;
1253         HASHBUCKET      newElement;
1254
1255         for (;;)
1256         {
1257                 /* if partitioned, must lock to touch nentries and freeList */
1258                 if (IS_PARTITIONED(hctl))
1259                         SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1260
1261                 /* try to get an entry from the freelist */
1262                 newElement = hctl->freeList[freelist_idx].freeList;
1263
1264                 if (newElement != NULL)
1265                         break;
1266
1267                 if (IS_PARTITIONED(hctl))
1268                         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1269
1270                 /*
1271                  * No free elements in this freelist.  In a partitioned table, there
1272                  * might be entries in other freelists, but to reduce contention we
1273                  * prefer to first try to get another chunk of buckets from the main
1274                  * shmem allocator.  If that fails, though, we *MUST* root through all
1275                  * the other freelists before giving up.  There are multiple callers
1276                  * that assume that they can allocate every element in the initially
1277                  * requested table size, or that deleting an element guarantees they
1278                  * can insert a new element, even if shared memory is entirely full.
1279                  * Failing because the needed element is in a different freelist is
1280                  * not acceptable.
1281                  */
1282                 if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1283                 {
1284                         int                     borrow_from_idx;
1285
1286                         if (!IS_PARTITIONED(hctl))
1287                                 return NULL;    /* out of memory */
1288
1289                         /* try to borrow element from another freelist */
1290                         borrow_from_idx = freelist_idx;
1291                         for (;;)
1292                         {
1293                                 borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
1294                                 if (borrow_from_idx == freelist_idx)
1295                                         break;          /* examined all freelists, fail */
1296
1297                                 SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1298                                 newElement = hctl->freeList[borrow_from_idx].freeList;
1299
1300                                 if (newElement != NULL)
1301                                 {
1302                                         hctl->freeList[borrow_from_idx].freeList = newElement->link;
1303                                         SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1304
1305                                         /* careful: count the new element in its proper freelist */
1306                                         SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1307                                         hctl->freeList[freelist_idx].nentries++;
1308                                         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1309
1310                                         return newElement;
1311                                 }
1312
1313                                 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1314                         }
1315
1316                         /* no elements available to borrow either, so out of memory */
1317                         return NULL;
1318                 }
1319         }
1320
1321         /* remove entry from freelist, bump nentries */
1322         hctl->freeList[freelist_idx].freeList = newElement->link;
1323         hctl->freeList[freelist_idx].nentries++;
1324
1325         if (IS_PARTITIONED(hctl))
1326                 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1327
1328         return newElement;
1329 }
1330
1331 /*
1332  * hash_get_num_entries -- get the number of entries in a hashtable
1333  */
1334 long
1335 hash_get_num_entries(HTAB *hashp)
1336 {
1337         int                     i;
1338         long            sum = hashp->hctl->freeList[0].nentries;
1339
1340         /*
1341          * We currently don't bother with acquiring the mutexes; it's only
1342          * sensible to call this function if you've got lock on all partitions of
1343          * the table.
1344          */
1345         if (IS_PARTITIONED(hashp->hctl))
1346         {
1347                 for (i = 1; i < NUM_FREELISTS; i++)
1348                         sum += hashp->hctl->freeList[i].nentries;
1349         }
1350
1351         return sum;
1352 }
1353
1354 /*
1355  * hash_seq_init/_search/_term
1356  *                      Sequentially search through hash table and return
1357  *                      all the elements one by one, return NULL when no more.
1358  *
1359  * hash_seq_term should be called if and only if the scan is abandoned before
1360  * completion; if hash_seq_search returns NULL then it has already done the
1361  * end-of-scan cleanup.
1362  *
1363  * NOTE: caller may delete the returned element before continuing the scan.
1364  * However, deleting any other element while the scan is in progress is
1365  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
1366  * if elements are added to the table while the scan is in progress, it is
1367  * unspecified whether they will be visited by the scan or not.
1368  *
1369  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1370  * worry about hash_seq_term cleanup, if the hashtable is first locked against
1371  * further insertions by calling hash_freeze.
1372  *
1373  * NOTE: to use this with a partitioned hashtable, caller had better hold
1374  * at least shared lock on all partitions of the table throughout the scan!
1375  * We can cope with insertions or deletions by our own backend, but *not*
1376  * with concurrent insertions or deletions by another.
1377  */
1378 void
1379 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1380 {
1381         status->hashp = hashp;
1382         status->curBucket = 0;
1383         status->curEntry = NULL;
1384         if (!hashp->frozen)
1385                 register_seq_scan(hashp);
1386 }
1387
1388 void *
1389 hash_seq_search(HASH_SEQ_STATUS *status)
1390 {
1391         HTAB       *hashp;
1392         HASHHDR    *hctl;
1393         uint32          max_bucket;
1394         long            ssize;
1395         long            segment_num;
1396         long            segment_ndx;
1397         HASHSEGMENT segp;
1398         uint32          curBucket;
1399         HASHELEMENT *curElem;
1400
1401         if ((curElem = status->curEntry) != NULL)
1402         {
1403                 /* Continuing scan of curBucket... */
1404                 status->curEntry = curElem->link;
1405                 if (status->curEntry == NULL)   /* end of this bucket */
1406                         ++status->curBucket;
1407                 return (void *) ELEMENTKEY(curElem);
1408         }
1409
1410         /*
1411          * Search for next nonempty bucket starting at curBucket.
1412          */
1413         curBucket = status->curBucket;
1414         hashp = status->hashp;
1415         hctl = hashp->hctl;
1416         ssize = hashp->ssize;
1417         max_bucket = hctl->max_bucket;
1418
1419         if (curBucket > max_bucket)
1420         {
1421                 hash_seq_term(status);
1422                 return NULL;                    /* search is done */
1423         }
1424
1425         /*
1426          * first find the right segment in the table directory.
1427          */
1428         segment_num = curBucket >> hashp->sshift;
1429         segment_ndx = MOD(curBucket, ssize);
1430
1431         segp = hashp->dir[segment_num];
1432
1433         /*
1434          * Pick up the first item in this bucket's chain.  If chain is not empty
1435          * we can begin searching it.  Otherwise we have to advance to find the
1436          * next nonempty bucket.  We try to optimize that case since searching a
1437          * near-empty hashtable has to iterate this loop a lot.
1438          */
1439         while ((curElem = segp[segment_ndx]) == NULL)
1440         {
1441                 /* empty bucket, advance to next */
1442                 if (++curBucket > max_bucket)
1443                 {
1444                         status->curBucket = curBucket;
1445                         hash_seq_term(status);
1446                         return NULL;            /* search is done */
1447                 }
1448                 if (++segment_ndx >= ssize)
1449                 {
1450                         segment_num++;
1451                         segment_ndx = 0;
1452                         segp = hashp->dir[segment_num];
1453                 }
1454         }
1455
1456         /* Begin scan of curBucket... */
1457         status->curEntry = curElem->link;
1458         if (status->curEntry == NULL)   /* end of this bucket */
1459                 ++curBucket;
1460         status->curBucket = curBucket;
1461         return (void *) ELEMENTKEY(curElem);
1462 }
1463
1464 void
1465 hash_seq_term(HASH_SEQ_STATUS *status)
1466 {
1467         if (!status->hashp->frozen)
1468                 deregister_seq_scan(status->hashp);
1469 }
1470
1471 /*
1472  * hash_freeze
1473  *                      Freeze a hashtable against future insertions (deletions are
1474  *                      still allowed)
1475  *
1476  * The reason for doing this is that by preventing any more bucket splits,
1477  * we no longer need to worry about registering hash_seq_search scans,
1478  * and thus caller need not be careful about ensuring hash_seq_term gets
1479  * called at the right times.
1480  *
1481  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1482  * with active scans (since hash_seq_term would then do the wrong thing).
1483  */
1484 void
1485 hash_freeze(HTAB *hashp)
1486 {
1487         if (hashp->isshared)
1488                 elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1489         if (!hashp->frozen && has_seq_scans(hashp))
1490                 elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1491                          hashp->tabname);
1492         hashp->frozen = true;
1493 }
1494
1495
1496 /********************************* UTILITIES ************************/
1497
1498 /*
1499  * Expand the table by adding one more hash bucket.
1500  */
1501 static bool
1502 expand_table(HTAB *hashp)
1503 {
1504         HASHHDR    *hctl = hashp->hctl;
1505         HASHSEGMENT old_seg,
1506                                 new_seg;
1507         long            old_bucket,
1508                                 new_bucket;
1509         long            new_segnum,
1510                                 new_segndx;
1511         long            old_segnum,
1512                                 old_segndx;
1513         HASHBUCKET *oldlink,
1514                            *newlink;
1515         HASHBUCKET      currElement,
1516                                 nextElement;
1517
1518         Assert(!IS_PARTITIONED(hctl));
1519
1520 #ifdef HASH_STATISTICS
1521         hash_expansions++;
1522 #endif
1523
1524         new_bucket = hctl->max_bucket + 1;
1525         new_segnum = new_bucket >> hashp->sshift;
1526         new_segndx = MOD(new_bucket, hashp->ssize);
1527
1528         if (new_segnum >= hctl->nsegs)
1529         {
1530                 /* Allocate new segment if necessary -- could fail if dir full */
1531                 if (new_segnum >= hctl->dsize)
1532                         if (!dir_realloc(hashp))
1533                                 return false;
1534                 if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
1535                         return false;
1536                 hctl->nsegs++;
1537         }
1538
1539         /* OK, we created a new bucket */
1540         hctl->max_bucket++;
1541
1542         /*
1543          * *Before* changing masks, find old bucket corresponding to same hash
1544          * values; values in that bucket may need to be relocated to new bucket.
1545          * Note that new_bucket is certainly larger than low_mask at this point,
1546          * so we can skip the first step of the regular hash mask calc.
1547          */
1548         old_bucket = (new_bucket & hctl->low_mask);
1549
1550         /*
1551          * If we crossed a power of 2, readjust masks.
1552          */
1553         if ((uint32) new_bucket > hctl->high_mask)
1554         {
1555                 hctl->low_mask = hctl->high_mask;
1556                 hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1557         }
1558
1559         /*
1560          * Relocate records to the new bucket.  NOTE: because of the way the hash
1561          * masking is done in calc_bucket, only one old bucket can need to be
1562          * split at this point.  With a different way of reducing the hash value,
1563          * that might not be true!
1564          */
1565         old_segnum = old_bucket >> hashp->sshift;
1566         old_segndx = MOD(old_bucket, hashp->ssize);
1567
1568         old_seg = hashp->dir[old_segnum];
1569         new_seg = hashp->dir[new_segnum];
1570
1571         oldlink = &old_seg[old_segndx];
1572         newlink = &new_seg[new_segndx];
1573
1574         for (currElement = *oldlink;
1575                  currElement != NULL;
1576                  currElement = nextElement)
1577         {
1578                 nextElement = currElement->link;
1579                 if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1580                 {
1581                         *oldlink = currElement;
1582                         oldlink = &currElement->link;
1583                 }
1584                 else
1585                 {
1586                         *newlink = currElement;
1587                         newlink = &currElement->link;
1588                 }
1589         }
1590         /* don't forget to terminate the rebuilt hash chains... */
1591         *oldlink = NULL;
1592         *newlink = NULL;
1593
1594         return true;
1595 }
1596
1597
1598 static bool
1599 dir_realloc(HTAB *hashp)
1600 {
1601         HASHSEGMENT *p;
1602         HASHSEGMENT *old_p;
1603         long            new_dsize;
1604         long            old_dirsize;
1605         long            new_dirsize;
1606
1607         if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1608                 return false;
1609
1610         /* Reallocate directory */
1611         new_dsize = hashp->hctl->dsize << 1;
1612         old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1613         new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1614
1615         old_p = hashp->dir;
1616         CurrentDynaHashCxt = hashp->hcxt;
1617         p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1618
1619         if (p != NULL)
1620         {
1621                 memcpy(p, old_p, old_dirsize);
1622                 MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1623                 hashp->dir = p;
1624                 hashp->hctl->dsize = new_dsize;
1625
1626                 /* XXX assume the allocator is palloc, so we know how to free */
1627                 Assert(hashp->alloc == DynaHashAlloc);
1628                 pfree(old_p);
1629
1630                 return true;
1631         }
1632
1633         return false;
1634 }
1635
1636
1637 static HASHSEGMENT
1638 seg_alloc(HTAB *hashp)
1639 {
1640         HASHSEGMENT segp;
1641
1642         CurrentDynaHashCxt = hashp->hcxt;
1643         segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1644
1645         if (!segp)
1646                 return NULL;
1647
1648         MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1649
1650         return segp;
1651 }
1652
1653 /*
1654  * allocate some new elements and link them into the indicated free list
1655  */
1656 static bool
1657 element_alloc(HTAB *hashp, int nelem, int freelist_idx)
1658 {
1659         HASHHDR    *hctl = hashp->hctl;
1660         Size            elementSize;
1661         HASHELEMENT *firstElement;
1662         HASHELEMENT *tmpElement;
1663         HASHELEMENT *prevElement;
1664         int                     i;
1665
1666         if (hashp->isfixed)
1667                 return false;
1668
1669         /* Each element has a HASHELEMENT header plus user data. */
1670         elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
1671
1672         CurrentDynaHashCxt = hashp->hcxt;
1673         firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
1674
1675         if (!firstElement)
1676                 return false;
1677
1678         /* prepare to link all the new entries into the freelist */
1679         prevElement = NULL;
1680         tmpElement = firstElement;
1681         for (i = 0; i < nelem; i++)
1682         {
1683                 tmpElement->link = prevElement;
1684                 prevElement = tmpElement;
1685                 tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1686         }
1687
1688         /* if partitioned, must lock to touch freeList */
1689         if (IS_PARTITIONED(hctl))
1690                 SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1691
1692         /* freelist could be nonempty if two backends did this concurrently */
1693         firstElement->link = hctl->freeList[freelist_idx].freeList;
1694         hctl->freeList[freelist_idx].freeList = prevElement;
1695
1696         if (IS_PARTITIONED(hctl))
1697                 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1698
1699         return true;
1700 }
1701
1702 /* complain when we have detected a corrupted hashtable */
1703 static void
1704 hash_corrupted(HTAB *hashp)
1705 {
1706         /*
1707          * If the corruption is in a shared hashtable, we'd better force a
1708          * systemwide restart.  Otherwise, just shut down this one backend.
1709          */
1710         if (hashp->isshared)
1711                 elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1712         else
1713                 elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1714 }
1715
1716 /* calculate ceil(log base 2) of num */
1717 int
1718 my_log2(long num)
1719 {
1720         int                     i;
1721         long            limit;
1722
1723         /* guard against too-large input, which would put us into infinite loop */
1724         if (num > LONG_MAX / 2)
1725                 num = LONG_MAX / 2;
1726
1727         for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
1728                 ;
1729         return i;
1730 }
1731
1732 /* calculate first power of 2 >= num, bounded to what will fit in a long */
1733 static long
1734 next_pow2_long(long num)
1735 {
1736         /* my_log2's internal range check is sufficient */
1737         return 1L << my_log2(num);
1738 }
1739
1740 /* calculate first power of 2 >= num, bounded to what will fit in an int */
1741 static int
1742 next_pow2_int(long num)
1743 {
1744         if (num > INT_MAX / 2)
1745                 num = INT_MAX / 2;
1746         return 1 << my_log2(num);
1747 }
1748
1749
1750 /************************* SEQ SCAN TRACKING ************************/
1751
1752 /*
1753  * We track active hash_seq_search scans here.  The need for this mechanism
1754  * comes from the fact that a scan will get confused if a bucket split occurs
1755  * while it's in progress: it might visit entries twice, or even miss some
1756  * entirely (if it's partway through the same bucket that splits).  Hence
1757  * we want to inhibit bucket splits if there are any active scans on the
1758  * table being inserted into.  This is a fairly rare case in current usage,
1759  * so just postponing the split until the next insertion seems sufficient.
1760  *
1761  * Given present usages of the function, only a few scans are likely to be
1762  * open concurrently; so a finite-size stack of open scans seems sufficient,
1763  * and we don't worry that linear search is too slow.  Note that we do
1764  * allow multiple scans of the same hashtable to be open concurrently.
1765  *
1766  * This mechanism can support concurrent scan and insertion in a shared
1767  * hashtable if it's the same backend doing both.  It would fail otherwise,
1768  * but locking reasons seem to preclude any such scenario anyway, so we don't
1769  * worry.
1770  *
1771  * This arrangement is reasonably robust if a transient hashtable is deleted
1772  * without notifying us.  The absolute worst case is we might inhibit splits
1773  * in another table created later at exactly the same address.  We will give
1774  * a warning at transaction end for reference leaks, so any bugs leading to
1775  * lack of notification should be easy to catch.
1776  */
1777
1778 #define MAX_SEQ_SCANS 100
1779
1780 static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
1781 static int      seq_scan_level[MAX_SEQ_SCANS];  /* subtransaction nest level */
1782 static int      num_seq_scans = 0;
1783
1784
1785 /* Register a table as having an active hash_seq_search scan */
1786 static void
1787 register_seq_scan(HTAB *hashp)
1788 {
1789         if (num_seq_scans >= MAX_SEQ_SCANS)
1790                 elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1791                          hashp->tabname);
1792         seq_scan_tables[num_seq_scans] = hashp;
1793         seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1794         num_seq_scans++;
1795 }
1796
1797 /* Deregister an active scan */
1798 static void
1799 deregister_seq_scan(HTAB *hashp)
1800 {
1801         int                     i;
1802
1803         /* Search backward since it's most likely at the stack top */
1804         for (i = num_seq_scans - 1; i >= 0; i--)
1805         {
1806                 if (seq_scan_tables[i] == hashp)
1807                 {
1808                         seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1809                         seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1810                         num_seq_scans--;
1811                         return;
1812                 }
1813         }
1814         elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1815                  hashp->tabname);
1816 }
1817
1818 /* Check if a table has any active scan */
1819 static bool
1820 has_seq_scans(HTAB *hashp)
1821 {
1822         int                     i;
1823
1824         for (i = 0; i < num_seq_scans; i++)
1825         {
1826                 if (seq_scan_tables[i] == hashp)
1827                         return true;
1828         }
1829         return false;
1830 }
1831
1832 /* Clean up any open scans at end of transaction */
1833 void
1834 AtEOXact_HashTables(bool isCommit)
1835 {
1836         /*
1837          * During abort cleanup, open scans are expected; just silently clean 'em
1838          * out.  An open scan at commit means someone forgot a hash_seq_term()
1839          * call, so complain.
1840          *
1841          * Note: it's tempting to try to print the tabname here, but refrain for
1842          * fear of touching deallocated memory.  This isn't a user-facing message
1843          * anyway, so it needn't be pretty.
1844          */
1845         if (isCommit)
1846         {
1847                 int                     i;
1848
1849                 for (i = 0; i < num_seq_scans; i++)
1850                 {
1851                         elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1852                                  seq_scan_tables[i]);
1853                 }
1854         }
1855         num_seq_scans = 0;
1856 }
1857
1858 /* Clean up any open scans at end of subtransaction */
1859 void
1860 AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1861 {
1862         int                     i;
1863
1864         /*
1865          * Search backward to make cleanup easy.  Note we must check all entries,
1866          * not only those at the end of the array, because deletion technique
1867          * doesn't keep them in order.
1868          */
1869         for (i = num_seq_scans - 1; i >= 0; i--)
1870         {
1871                 if (seq_scan_level[i] >= nestDepth)
1872                 {
1873                         if (isCommit)
1874                                 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1875                                          seq_scan_tables[i]);
1876                         seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1877                         seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1878                         num_seq_scans--;
1879                 }
1880         }
1881 }