granicus.if.org Git - postgresql/blob - src/backend/utils/hash/dynahash.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * dynahash.c
   4  *        dynamic hash tables
   5  *
   6  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
   7  * shared memory.  For shared hash tables, it is the caller's responsibility
   8  * to provide appropriate access interlocking.  The simplest convention is
   9  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
  10  * hash_seq_search) need only shared lock, but any update requires exclusive
  11  * lock.  For heavily-used shared tables, the single-lock approach creates a
  12  * concurrency bottleneck, so we also support "partitioned" locking wherein
  13  * there are multiple LWLocks guarding distinct subsets of the table.  To use
  14  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
  15  * to hash_create.      This prevents any attempt to split buckets on-the-fly.
  16  * Therefore, each hash bucket chain operates independently, and no fields
  17  * of the hash header change after init except nentries and freeList.
  18  * A partitioned table uses a spinlock to guard changes of those two fields.
  19  * This lets any subset of the hash buckets be treated as a separately
  20  * lockable partition.  We expect callers to use the low-order bits of a
  21  * lookup key's hash value as a partition number --- this will work because
  22  * of the way calc_bucket() maps hash values to bucket numbers.
  23  *
  24  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  25  * Portions Copyright (c) 1994, Regents of the University of California
  26  *
  27  *
  28  * IDENTIFICATION
  29  *        $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.79 2009/01/01 17:23:51 momjian Exp $
  30  *
  31  *-------------------------------------------------------------------------
  32  */
  33
  34 /*
  35  * Original comments:
  36  *
  37  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
  38  * Coded into C, with minor code improvements, and with hsearch(3) interface,
  39  * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
  40  * also, hcreate/hdestroy routines added to simulate hsearch(3).
  41  *
  42  * These routines simulate hsearch(3) and family, with the important
  43  * difference that the hash table is dynamic - can grow indefinitely
  44  * beyond its original size (as supplied to hcreate()).
  45  *
  46  * Performance appears to be comparable to that of hsearch(3).
  47  * The 'source-code' options referred to in hsearch(3)'s 'man' page
  48  * are not implemented; otherwise functionality is identical.
  49  *
  50  * Compilation controls:
  51  * DEBUG controls some informative traces, mainly for debugging.
  52  * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
  53  * when combined with HASH_DEBUG, these are displayed by hdestroy().
  54  *
  55  * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
  56  * concatenation property, in probably unnecessary code 'optimisation'.
  57  *
  58  * Modified margo@postgres.berkeley.edu February 1990
  59  *              added multiple table interface
  60  * Modified by sullivan@postgres.berkeley.edu April 1990
  61  *              changed ctl structure for shared memory
  62  */
  63
  64 #include "postgres.h"
  65
  66 #include "access/xact.h"
  67 #include "storage/shmem.h"
  68 #include "storage/spin.h"
  69 #include "utils/dynahash.h"
  70 #include "utils/memutils.h"
  71
  72
  73 /*
  74  * Constants
  75  *
  76  * A hash table has a top-level "directory", each of whose entries points
  77  * to a "segment" of ssize bucket headers.      The maximum number of hash
  78  * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
  79  * the number of records in the table can be larger, but we don't want a
  80  * whole lot of records per bucket or performance goes down.
  81  *
  82  * In a hash table allocated in shared memory, the directory cannot be
  83  * expanded because it must stay at a fixed address.  The directory size
  84  * should be selected using hash_select_dirsize (and you'd better have
  85  * a good idea of the maximum number of entries!).      For non-shared hash
  86  * tables, the initial directory size can be left at the default.
  87  */
  88 #define DEF_SEGSIZE                        256
  89 #define DEF_SEGSIZE_SHIFT          8    /* must be log2(DEF_SEGSIZE) */
  90 #define DEF_DIRSIZE                        256
  91 #define DEF_FFACTOR                        1    /* default fill factor */
  92
  93
  94 /* A hash bucket is a linked list of HASHELEMENTs */
  95 typedef HASHELEMENT *HASHBUCKET;
  96
  97 /* A hash segment is an array of bucket headers */
  98 typedef HASHBUCKET *HASHSEGMENT;
  99
 100 /*
 101  * Header structure for a hash table --- contains all changeable info
 102  *
 103  * In a shared-memory hash table, the HASHHDR is in shared memory, while
 104  * each backend has a local HTAB struct.  For a non-shared table, there isn't
 105  * any functional difference between HASHHDR and HTAB, but we separate them
 106  * anyway to share code between shared and non-shared tables.
 107  */
 108 struct HASHHDR
 109 {
 110         /* In a partitioned table, take this lock to touch nentries or freeList */
 111         slock_t         mutex;                  /* unused if not partitioned table */
 112
 113         /* These fields change during entry addition/deletion */
 114         long            nentries;               /* number of entries in hash table */
 115         HASHELEMENT *freeList;          /* linked list of free elements */
 116
 117         /* These fields can change, but not in a partitioned table */
 118         /* Also, dsize can't change in a shared table, even if unpartitioned */
 119         long            dsize;                  /* directory size */
 120         long            nsegs;                  /* number of allocated segments (<= dsize) */
 121         uint32          max_bucket;             /* ID of maximum bucket in use */
 122         uint32          high_mask;              /* mask to modulo into entire table */
 123         uint32          low_mask;               /* mask to modulo into lower half of table */
 124
 125         /* These fields are fixed at hashtable creation */
 126         Size            keysize;                /* hash key length in bytes */
 127         Size            entrysize;              /* total user element size in bytes */
 128         long            num_partitions; /* # partitions (must be power of 2), or 0 */
 129         long            ffactor;                /* target fill factor */
 130         long            max_dsize;              /* 'dsize' limit if directory is fixed size */
 131         long            ssize;                  /* segment size --- must be power of 2 */
 132         int                     sshift;                 /* segment shift = log2(ssize) */
 133         int                     nelem_alloc;    /* number of entries to allocate at once */
 134
 135 #ifdef HASH_STATISTICS
 136
 137         /*
 138          * Count statistics here.  NB: stats code doesn't bother with mutex, so
 139          * counts could be corrupted a bit in a partitioned table.
 140          */
 141         long            accesses;
 142         long            collisions;
 143 #endif
 144 };
 145
 146 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 147
 148 /*
 149  * Top control structure for a hashtable --- in a shared table, each backend
 150  * has its own copy (OK since no fields change at runtime)
 151  */
 152 struct HTAB
 153 {
 154         HASHHDR    *hctl;                       /* => shared control information */
 155         HASHSEGMENT *dir;                       /* directory of segment starts */
 156         HashValueFunc hash;                     /* hash function */
 157         HashCompareFunc match;          /* key comparison function */
 158         HashCopyFunc keycopy;           /* key copying function */
 159         HashAllocFunc alloc;            /* memory allocator */
 160         MemoryContext hcxt;                     /* memory context if default allocator used */
 161         char       *tabname;            /* table name (for error messages) */
 162         bool            isshared;               /* true if table is in shared memory */
 163
 164         /* freezing a shared table isn't allowed, so we can keep state here */
 165         bool            frozen;                 /* true = no more inserts allowed */
 166
 167         /* We keep local copies of these fixed values to reduce contention */
 168         Size            keysize;                /* hash key length in bytes */
 169         long            ssize;                  /* segment size --- must be power of 2 */
 170         int                     sshift;                 /* segment shift = log2(ssize) */
 171 };
 172
 173 /*
 174  * Key (also entry) part of a HASHELEMENT
 175  */
 176 #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
 177
 178 /*
 179  * Fast MOD arithmetic, assuming that y is a power of 2 !
 180  */
 181 #define MOD(x,y)                           ((x) & ((y)-1))
 182
 183 #if HASH_STATISTICS
 184 static long hash_accesses,
 185                         hash_collisions,
 186                         hash_expansions;
 187 #endif
 188
 189 /*
 190  * Private function prototypes
 191  */
 192 static void *DynaHashAlloc(Size size);
 193 static HASHSEGMENT seg_alloc(HTAB *hashp);
 194 static bool element_alloc(HTAB *hashp, int nelem);
 195 static bool dir_realloc(HTAB *hashp);
 196 static bool expand_table(HTAB *hashp);
 197 static HASHBUCKET get_hash_entry(HTAB *hashp);
 198 static void hdefault(HTAB *hashp);
 199 static int      choose_nelem_alloc(Size entrysize);
 200 static bool init_htab(HTAB *hashp, long nelem);
 201 static void hash_corrupted(HTAB *hashp);
 202 static void register_seq_scan(HTAB *hashp);
 203 static void deregister_seq_scan(HTAB *hashp);
 204 static bool has_seq_scans(HTAB *hashp);
 205
 206
 207 /*
 208  * memory allocation support
 209  */
 210 static MemoryContext CurrentDynaHashCxt = NULL;
 211
 212 static void *
 213 DynaHashAlloc(Size size)
 214 {
 215         Assert(MemoryContextIsValid(CurrentDynaHashCxt));
 216         return MemoryContextAlloc(CurrentDynaHashCxt, size);
 217 }
 218
 219
 220 /*
 221  * HashCompareFunc for string keys
 222  *
 223  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 224  * bytes, so we can only compare that many ... hence strncmp is almost but
 225  * not quite the right thing.
 226  */
 227 static int
 228 string_compare(const char *key1, const char *key2, Size keysize)
 229 {
 230         return strncmp(key1, key2, keysize - 1);
 231 }
 232
 233
 234 /************************** CREATE ROUTINES **********************/
 235
 236 /*
 237  * hash_create -- create a new dynamic hash table
 238  *
 239  *      tabname: a name for the table (for debugging purposes)
 240  *      nelem: maximum number of elements expected
 241  *      *info: additional table parameters, as indicated by flags
 242  *      flags: bitmask indicating which parameters to take from *info
 243  *
 244  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 245  * estimate, since we can't expand the table on the fly.  But an unshared
 246  * hashtable can be expanded on-the-fly, so it's better for nelem to be
 247  * on the small side and let the table grow if it's exceeded.  An overly
 248  * large nelem will penalize hash_seq_search speed without buying much.
 249  */
 250 HTAB *
 251 hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 252 {
 253         HTAB       *hashp;
 254         HASHHDR    *hctl;
 255
 256         /*
 257          * For shared hash tables, we have a local hash header (HTAB struct) that
 258          * we allocate in TopMemoryContext; all else is in shared memory.
 259          *
 260          * For non-shared hash tables, everything including the hash header is in
 261          * a memory context created specially for the hash table --- this makes
 262          * hash_destroy very simple.  The memory context is made a child of either
 263          * a context specified by the caller, or TopMemoryContext if nothing is
 264          * specified.
 265          */
 266         if (flags & HASH_SHARED_MEM)
 267         {
 268                 /* Set up to allocate the hash header */
 269                 CurrentDynaHashCxt = TopMemoryContext;
 270         }
 271         else
 272         {
 273                 /* Create the hash table's private memory context */
 274                 if (flags & HASH_CONTEXT)
 275                         CurrentDynaHashCxt = info->hcxt;
 276                 else
 277                         CurrentDynaHashCxt = TopMemoryContext;
 278                 CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
 279                                                                                                    tabname,
 280                                                                                                    ALLOCSET_DEFAULT_MINSIZE,
 281                                                                                                    ALLOCSET_DEFAULT_INITSIZE,
 282                                                                                                    ALLOCSET_DEFAULT_MAXSIZE);
 283         }
 284
 285         /* Initialize the hash header, plus a copy of the table name */
 286         hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) +1);
 287         MemSet(hashp, 0, sizeof(HTAB));
 288
 289         hashp->tabname = (char *) (hashp + 1);
 290         strcpy(hashp->tabname, tabname);
 291
 292         if (flags & HASH_FUNCTION)
 293                 hashp->hash = info->hash;
 294         else
 295                 hashp->hash = string_hash;              /* default hash function */
 296
 297         /*
 298          * If you don't specify a match function, it defaults to string_compare if
 299          * you used string_hash (either explicitly or by default) and to memcmp
 300          * otherwise.  (Prior to PostgreSQL 7.4, memcmp was always used.)
 301          */
 302         if (flags & HASH_COMPARE)
 303                 hashp->match = info->match;
 304         else if (hashp->hash == string_hash)
 305                 hashp->match = (HashCompareFunc) string_compare;
 306         else
 307                 hashp->match = memcmp;
 308
 309         /*
 310          * Similarly, the key-copying function defaults to strlcpy or memcpy.
 311          */
 312         if (flags & HASH_KEYCOPY)
 313                 hashp->keycopy = info->keycopy;
 314         else if (hashp->hash == string_hash)
 315                 hashp->keycopy = (HashCopyFunc) strlcpy;
 316         else
 317                 hashp->keycopy = memcpy;
 318
 319         if (flags & HASH_ALLOC)
 320                 hashp->alloc = info->alloc;
 321         else
 322                 hashp->alloc = DynaHashAlloc;
 323
 324         if (flags & HASH_SHARED_MEM)
 325         {
 326                 /*
 327                  * ctl structure and directory are preallocated for shared memory
 328                  * tables.      Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
 329                  * well.
 330                  */
 331                 hashp->hctl = info->hctl;
 332                 hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
 333                 hashp->hcxt = NULL;
 334                 hashp->isshared = true;
 335
 336                 /* hash table already exists, we're just attaching to it */
 337                 if (flags & HASH_ATTACH)
 338                 {
 339                         /* make local copies of some heavily-used values */
 340                         hctl = hashp->hctl;
 341                         hashp->keysize = hctl->keysize;
 342                         hashp->ssize = hctl->ssize;
 343                         hashp->sshift = hctl->sshift;
 344
 345                         return hashp;
 346                 }
 347         }
 348         else
 349         {
 350                 /* setup hash table defaults */
 351                 hashp->hctl = NULL;
 352                 hashp->dir = NULL;
 353                 hashp->hcxt = CurrentDynaHashCxt;
 354                 hashp->isshared = false;
 355         }
 356
 357         if (!hashp->hctl)
 358         {
 359                 hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
 360                 if (!hashp->hctl)
 361                         ereport(ERROR,
 362                                         (errcode(ERRCODE_OUT_OF_MEMORY),
 363                                          errmsg("out of memory")));
 364         }
 365
 366         hashp->frozen = false;
 367
 368         hdefault(hashp);
 369
 370         hctl = hashp->hctl;
 371
 372         if (flags & HASH_PARTITION)
 373         {
 374                 /* Doesn't make sense to partition a local hash table */
 375                 Assert(flags & HASH_SHARED_MEM);
 376                 /* # of partitions had better be a power of 2 */
 377                 Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));
 378
 379                 hctl->num_partitions = info->num_partitions;
 380         }
 381
 382         if (flags & HASH_SEGMENT)
 383         {
 384                 hctl->ssize = info->ssize;
 385                 hctl->sshift = my_log2(info->ssize);
 386                 /* ssize had better be a power of 2 */
 387                 Assert(hctl->ssize == (1L << hctl->sshift));
 388         }
 389         if (flags & HASH_FFACTOR)
 390                 hctl->ffactor = info->ffactor;
 391
 392         /*
 393          * SHM hash tables have fixed directory size passed by the caller.
 394          */
 395         if (flags & HASH_DIRSIZE)
 396         {
 397                 hctl->max_dsize = info->max_dsize;
 398                 hctl->dsize = info->dsize;
 399         }
 400
 401         /*
 402          * hash table now allocates space for key and data but you have to say how
 403          * much space to allocate
 404          */
 405         if (flags & HASH_ELEM)
 406         {
 407                 Assert(info->entrysize >= info->keysize);
 408                 hctl->keysize = info->keysize;
 409                 hctl->entrysize = info->entrysize;
 410         }
 411
 412         /* make local copies of heavily-used constant fields */
 413         hashp->keysize = hctl->keysize;
 414         hashp->ssize = hctl->ssize;
 415         hashp->sshift = hctl->sshift;
 416
 417         /* Build the hash directory structure */
 418         if (!init_htab(hashp, nelem))
 419                 elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
 420
 421         /*
 422          * For a shared hash table, preallocate the requested number of elements.
 423          * This reduces problems with run-time out-of-shared-memory conditions.
 424          *
 425          * For a non-shared hash table, preallocate the requested number of
 426          * elements if it's less than our chosen nelem_alloc.  This avoids wasting
 427          * space if the caller correctly estimates a small table size.
 428          */
 429         if ((flags & HASH_SHARED_MEM) ||
 430                 nelem < hctl->nelem_alloc)
 431         {
 432                 if (!element_alloc(hashp, (int) nelem))
 433                         ereport(ERROR,
 434                                         (errcode(ERRCODE_OUT_OF_MEMORY),
 435                                          errmsg("out of memory")));
 436         }
 437
 438         return hashp;
 439 }
 440
 441 /*
 442  * Set default HASHHDR parameters.
 443  */
 444 static void
 445 hdefault(HTAB *hashp)
 446 {
 447         HASHHDR    *hctl = hashp->hctl;
 448
 449         MemSet(hctl, 0, sizeof(HASHHDR));
 450
 451         hctl->nentries = 0;
 452         hctl->freeList = NULL;
 453
 454         hctl->dsize = DEF_DIRSIZE;
 455         hctl->nsegs = 0;
 456
 457         /* rather pointless defaults for key & entry size */
 458         hctl->keysize = sizeof(char *);
 459         hctl->entrysize = 2 * sizeof(char *);
 460
 461         hctl->num_partitions = 0;       /* not partitioned */
 462
 463         hctl->ffactor = DEF_FFACTOR;
 464
 465         /* table has no fixed maximum size */
 466         hctl->max_dsize = NO_MAX_DSIZE;
 467
 468         hctl->ssize = DEF_SEGSIZE;
 469         hctl->sshift = DEF_SEGSIZE_SHIFT;
 470
 471 #ifdef HASH_STATISTICS
 472         hctl->accesses = hctl->collisions = 0;
 473 #endif
 474 }
 475
 476 /*
 477  * Given the user-specified entry size, choose nelem_alloc, ie, how many
 478  * elements to add to the hash table when we need more.
 479  */
 480 static int
 481 choose_nelem_alloc(Size entrysize)
 482 {
 483         int                     nelem_alloc;
 484         Size            elementSize;
 485         Size            allocSize;
 486
 487         /* Each element has a HASHELEMENT header plus user data. */
 488         /* NB: this had better match element_alloc() */
 489         elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
 490
 491         /*
 492          * The idea here is to choose nelem_alloc at least 32, but round up so
 493          * that the allocation request will be a power of 2 or just less. This
 494          * makes little difference for hash tables in shared memory, but for hash
 495          * tables managed by palloc, the allocation request will be rounded up to
 496          * a power of 2 anyway.  If we fail to take this into account, we'll waste
 497          * as much as half the allocated space.
 498          */
 499         allocSize = 32 * 4;                     /* assume elementSize at least 8 */
 500         do
 501         {
 502                 allocSize <<= 1;
 503                 nelem_alloc = allocSize / elementSize;
 504         } while (nelem_alloc < 32);
 505
 506         return nelem_alloc;
 507 }
 508
 509 /*
 510  * Compute derived fields of hctl and build the initial directory/segment
 511  * arrays
 512  */
 513 static bool
 514 init_htab(HTAB *hashp, long nelem)
 515 {
 516         HASHHDR    *hctl = hashp->hctl;
 517         HASHSEGMENT *segp;
 518         long            lnbuckets;
 519         int                     nbuckets;
 520         int                     nsegs;
 521
 522         /*
 523          * initialize mutex if it's a partitioned table
 524          */
 525         if (IS_PARTITIONED(hctl))
 526                 SpinLockInit(&hctl->mutex);
 527
 528         /*
 529          * Divide number of elements by the fill factor to determine a desired
 530          * number of buckets.  Allocate space for the next greater power of two
 531          * number of buckets
 532          */
 533         lnbuckets = (nelem - 1) / hctl->ffactor + 1;
 534
 535         nbuckets = 1 << my_log2(lnbuckets);
 536
 537         /*
 538          * In a partitioned table, nbuckets must be at least equal to
 539          * num_partitions; were it less, keys with apparently different partition
 540          * numbers would map to the same bucket, breaking partition independence.
 541          * (Normally nbuckets will be much bigger; this is just a safety check.)
 542          */
 543         while (nbuckets < hctl->num_partitions)
 544                 nbuckets <<= 1;
 545
 546         hctl->max_bucket = hctl->low_mask = nbuckets - 1;
 547         hctl->high_mask = (nbuckets << 1) - 1;
 548
 549         /*
 550          * Figure number of directory segments needed, round up to a power of 2
 551          */
 552         nsegs = (nbuckets - 1) / hctl->ssize + 1;
 553         nsegs = 1 << my_log2(nsegs);
 554
 555         /*
 556          * Make sure directory is big enough. If pre-allocated directory is too
 557          * small, choke (caller screwed up).
 558          */
 559         if (nsegs > hctl->dsize)
 560         {
 561                 if (!(hashp->dir))
 562                         hctl->dsize = nsegs;
 563                 else
 564                         return false;
 565         }
 566
 567         /* Allocate a directory */
 568         if (!(hashp->dir))
 569         {
 570                 CurrentDynaHashCxt = hashp->hcxt;
 571                 hashp->dir = (HASHSEGMENT *)
 572                         hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
 573                 if (!hashp->dir)
 574                         return false;
 575         }
 576
 577         /* Allocate initial segments */
 578         for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
 579         {
 580                 *segp = seg_alloc(hashp);
 581                 if (*segp == NULL)
 582                         return false;
 583         }
 584
 585         /* Choose number of entries to allocate at a time */
 586         hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
 587
 588 #if HASH_DEBUG
 589         fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n%s%ld\n",
 590                         "TABLE POINTER   ", hashp,
 591                         "DIRECTORY SIZE  ", hctl->dsize,
 592                         "SEGMENT SIZE    ", hctl->ssize,
 593                         "SEGMENT SHIFT   ", hctl->sshift,
 594                         "FILL FACTOR     ", hctl->ffactor,
 595                         "MAX BUCKET      ", hctl->max_bucket,
 596                         "HIGH MASK       ", hctl->high_mask,
 597                         "LOW  MASK       ", hctl->low_mask,
 598                         "NSEGS           ", hctl->nsegs,
 599                         "NENTRIES        ", hctl->nentries);
 600 #endif
 601         return true;
 602 }
 603
 604 /*
 605  * Estimate the space needed for a hashtable containing the given number
 606  * of entries of given size.
 607  * NOTE: this is used to estimate the footprint of hashtables in shared
 608  * memory; therefore it does not count HTAB which is in local memory.
 609  * NB: assumes that all hash structure parameters have default values!
 610  */
 611 Size
 612 hash_estimate_size(long num_entries, Size entrysize)
 613 {
 614         Size            size;
 615         long            nBuckets,
 616                                 nSegments,
 617                                 nDirEntries,
 618                                 nElementAllocs,
 619                                 elementSize,
 620                                 elementAllocCnt;
 621
 622         /* estimate number of buckets wanted */
 623         nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
 624         /* # of segments needed for nBuckets */
 625         nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
 626         /* directory entries */
 627         nDirEntries = DEF_DIRSIZE;
 628         while (nDirEntries < nSegments)
 629                 nDirEntries <<= 1;              /* dir_alloc doubles dsize at each call */
 630
 631         /* fixed control info */
 632         size = MAXALIGN(sizeof(HASHHDR));       /* but not HTAB, per above */
 633         /* directory */
 634         size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
 635         /* segments */
 636         size = add_size(size, mul_size(nSegments,
 637                                                                 MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
 638         /* elements --- allocated in groups of choose_nelem_alloc() entries */
 639         elementAllocCnt = choose_nelem_alloc(entrysize);
 640         nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
 641         elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
 642         size = add_size(size,
 643                                         mul_size(nElementAllocs,
 644                                                          mul_size(elementAllocCnt, elementSize)));
 645
 646         return size;
 647 }
 648
 649 /*
 650  * Select an appropriate directory size for a hashtable with the given
 651  * maximum number of entries.
 652  * This is only needed for hashtables in shared memory, whose directories
 653  * cannot be expanded dynamically.
 654  * NB: assumes that all hash structure parameters have default values!
 655  *
 656  * XXX this had better agree with the behavior of init_htab()...
 657  */
 658 long
 659 hash_select_dirsize(long num_entries)
 660 {
 661         long            nBuckets,
 662                                 nSegments,
 663                                 nDirEntries;
 664
 665         /* estimate number of buckets wanted */
 666         nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
 667         /* # of segments needed for nBuckets */
 668         nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
 669         /* directory entries */
 670         nDirEntries = DEF_DIRSIZE;
 671         while (nDirEntries < nSegments)
 672                 nDirEntries <<= 1;              /* dir_alloc doubles dsize at each call */
 673
 674         return nDirEntries;
 675 }
 676
 677 /*
 678  * Compute the required initial memory allocation for a shared-memory
 679  * hashtable with the given parameters.  We need space for the HASHHDR
 680  * and for the (non expansible) directory.
 681  */
 682 Size
 683 hash_get_shared_size(HASHCTL *info, int flags)
 684 {
 685         Assert(flags & HASH_DIRSIZE);
 686         Assert(info->dsize == info->max_dsize);
 687         return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
 688 }
 689
 690
 691 /********************** DESTROY ROUTINES ************************/
 692
 693 void
 694 hash_destroy(HTAB *hashp)
 695 {
 696         if (hashp != NULL)
 697         {
 698                 /* allocation method must be one we know how to free, too */
 699                 Assert(hashp->alloc == DynaHashAlloc);
 700                 /* so this hashtable must have it's own context */
 701                 Assert(hashp->hcxt != NULL);
 702
 703                 hash_stats("destroy", hashp);
 704
 705                 /*
 706                  * Free everything by destroying the hash table's memory context.
 707                  */
 708                 MemoryContextDelete(hashp->hcxt);
 709         }
 710 }
 711
 712 void
 713 hash_stats(const char *where, HTAB *hashp)
 714 {
 715 #if HASH_STATISTICS
 716         fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
 717                         where, hashp->hctl->accesses, hashp->hctl->collisions);
 718
 719         fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
 720                         hashp->hctl->nentries, (long) hashp->hctl->keysize,
 721                         hashp->hctl->max_bucket, hashp->hctl->nsegs);
 722         fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 723                         where, hash_accesses, hash_collisions);
 724         fprintf(stderr, "hash_stats: total expansions %ld\n",
 725                         hash_expansions);
 726 #endif
 727 }
 728
 729 /*******************************SEARCH ROUTINES *****************************/
 730
 731
 732 /*
 733  * get_hash_value -- exported routine to calculate a key's hash value
 734  *
 735  * We export this because for partitioned tables, callers need to compute
 736  * the partition number (from the low-order bits of the hash value) before
 737  * searching.
 738  */
 739 uint32
 740 get_hash_value(HTAB *hashp, const void *keyPtr)
 741 {
 742         return hashp->hash(keyPtr, hashp->keysize);
 743 }
 744
 745 /* Convert a hash value to a bucket number */
 746 static inline uint32
 747 calc_bucket(HASHHDR *hctl, uint32 hash_val)
 748 {
 749         uint32          bucket;
 750
 751         bucket = hash_val & hctl->high_mask;
 752         if (bucket > hctl->max_bucket)
 753                 bucket = bucket & hctl->low_mask;
 754
 755         return bucket;
 756 }
 757
 758 /*
 759  * hash_search -- look up key in table and perform action
 760  * hash_search_with_hash_value -- same, with key's hash value already computed
 761  *
 762  * action is one of:
 763  *              HASH_FIND: look up key in table
 764  *              HASH_ENTER: look up key in table, creating entry if not present
 765  *              HASH_ENTER_NULL: same, but return NULL if out of memory
 766  *              HASH_REMOVE: look up key in table, remove entry if present
 767  *
 768  * Return value is a pointer to the element found/entered/removed if any,
 769  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 770  * the result is a dangling pointer that shouldn't be dereferenced!)
 771  *
 772  * HASH_ENTER will normally ereport a generic "out of memory" error if
 773  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 774  * the same except it will return NULL if out of memory.  Note that
 775  * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 776  * since palloc internally ereports on out-of-memory.
 777  *
 778  * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
 779  * existing entry in the table, FALSE otherwise.  This is needed in the
 780  * HASH_ENTER case, but is redundant with the return value otherwise.
 781  *
 782  * For hash_search_with_hash_value, the hashvalue parameter must have been
 783  * calculated with get_hash_value().
 784  */
 785 void *
 786 hash_search(HTAB *hashp,
 787                         const void *keyPtr,
 788                         HASHACTION action,
 789                         bool *foundPtr)
 790 {
 791         return hash_search_with_hash_value(hashp,
 792                                                                            keyPtr,
 793                                                                            hashp->hash(keyPtr, hashp->keysize),
 794                                                                            action,
 795                                                                            foundPtr);
 796 }
 797
 798 void *
 799 hash_search_with_hash_value(HTAB *hashp,
 800                                                         const void *keyPtr,
 801                                                         uint32 hashvalue,
 802                                                         HASHACTION action,
 803                                                         bool *foundPtr)
 804 {
 805         HASHHDR    *hctl = hashp->hctl;
 806         Size            keysize;
 807         uint32          bucket;
 808         long            segment_num;
 809         long            segment_ndx;
 810         HASHSEGMENT segp;
 811         HASHBUCKET      currBucket;
 812         HASHBUCKET *prevBucketPtr;
 813         HashCompareFunc match;
 814
 815 #if HASH_STATISTICS
 816         hash_accesses++;
 817         hctl->accesses++;
 818 #endif
 819
 820         /*
 821          * Do the initial lookup
 822          */
 823         bucket = calc_bucket(hctl, hashvalue);
 824
 825         segment_num = bucket >> hashp->sshift;
 826         segment_ndx = MOD(bucket, hashp->ssize);
 827
 828         segp = hashp->dir[segment_num];
 829
 830         if (segp == NULL)
 831                 hash_corrupted(hashp);
 832
 833         prevBucketPtr = &segp[segment_ndx];
 834         currBucket = *prevBucketPtr;
 835
 836         /*
 837          * Follow collision chain looking for matching key
 838          */
 839         match = hashp->match;           /* save one fetch in inner loop */
 840         keysize = hashp->keysize;       /* ditto */
 841
 842         while (currBucket != NULL)
 843         {
 844                 if (currBucket->hashvalue == hashvalue &&
 845                         match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
 846                         break;
 847                 prevBucketPtr = &(currBucket->link);
 848                 currBucket = *prevBucketPtr;
 849 #if HASH_STATISTICS
 850                 hash_collisions++;
 851                 hctl->collisions++;
 852 #endif
 853         }
 854
 855         if (foundPtr)
 856                 *foundPtr = (bool) (currBucket != NULL);
 857
 858         /*
 859          * OK, now what?
 860          */
 861         switch (action)
 862         {
 863                 case HASH_FIND:
 864                         if (currBucket != NULL)
 865                                 return (void *) ELEMENTKEY(currBucket);
 866                         return NULL;
 867
 868                 case HASH_REMOVE:
 869                         if (currBucket != NULL)
 870                         {
 871                                 /* use volatile pointer to prevent code rearrangement */
 872                                 volatile HASHHDR *hctlv = hctl;
 873
 874                                 /* if partitioned, must lock to touch nentries and freeList */
 875                                 if (IS_PARTITIONED(hctlv))
 876                                         SpinLockAcquire(&hctlv->mutex);
 877
 878                                 Assert(hctlv->nentries > 0);
 879                                 hctlv->nentries--;
 880
 881                                 /* remove record from hash bucket's chain. */
 882                                 *prevBucketPtr = currBucket->link;
 883
 884                                 /* add the record to the freelist for this table.  */
 885                                 currBucket->link = hctlv->freeList;
 886                                 hctlv->freeList = currBucket;
 887
 888                                 if (IS_PARTITIONED(hctlv))
 889                                         SpinLockRelease(&hctlv->mutex);
 890
 891                                 /*
 892                                  * better hope the caller is synchronizing access to this
 893                                  * element, because someone else is going to reuse it the next
 894                                  * time something is added to the table
 895                                  */
 896                                 return (void *) ELEMENTKEY(currBucket);
 897                         }
 898                         return NULL;
 899
 900                 case HASH_ENTER_NULL:
 901                         /* ENTER_NULL does not work with palloc-based allocator */
 902                         Assert(hashp->alloc != DynaHashAlloc);
 903                         /* FALL THRU */
 904
 905                 case HASH_ENTER:
 906                         /* Return existing element if found, else create one */
 907                         if (currBucket != NULL)
 908                                 return (void *) ELEMENTKEY(currBucket);
 909
 910                         /* disallow inserts if frozen */
 911                         if (hashp->frozen)
 912                                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 913                                          hashp->tabname);
 914
 915                         currBucket = get_hash_entry(hashp);
 916                         if (currBucket == NULL)
 917                         {
 918                                 /* out of memory */
 919                                 if (action == HASH_ENTER_NULL)
 920                                         return NULL;
 921                                 /* report a generic message */
 922                                 if (hashp->isshared)
 923                                         ereport(ERROR,
 924                                                         (errcode(ERRCODE_OUT_OF_MEMORY),
 925                                                          errmsg("out of shared memory")));
 926                                 else
 927                                         ereport(ERROR,
 928                                                         (errcode(ERRCODE_OUT_OF_MEMORY),
 929                                                          errmsg("out of memory")));
 930                         }
 931
 932                         /* link into hashbucket chain */
 933                         *prevBucketPtr = currBucket;
 934                         currBucket->link = NULL;
 935
 936                         /* copy key into record */
 937                         currBucket->hashvalue = hashvalue;
 938                         hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
 939
 940                         /* caller is expected to fill the data field on return */
 941
 942                         /*
 943                          * Check if it is time to split a bucket.  Can't split if running
 944                          * in partitioned mode, nor if table is the subject of any active
 945                          * hash_seq_search scans.  Strange order of these tests is to try
 946                          * to check cheaper conditions first.
 947                          */
 948                         if (!IS_PARTITIONED(hctl) &&
 949                         hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
 950                                 !has_seq_scans(hashp))
 951                         {
 952                                 /*
 953                                  * NOTE: failure to expand table is not a fatal error, it just
 954                                  * means we have to run at higher fill factor than we wanted.
 955                                  */
 956                                 expand_table(hashp);
 957                         }
 958
 959                         return (void *) ELEMENTKEY(currBucket);
 960         }
 961
 962         elog(ERROR, "unrecognized hash action code: %d", (int) action);
 963
 964         return NULL;                            /* keep compiler quiet */
 965 }
 966
 967 /*
 968  * create a new entry if possible
 969  */
 970 static HASHBUCKET
 971 get_hash_entry(HTAB *hashp)
 972 {
 973         /* use volatile pointer to prevent code rearrangement */
 974         volatile HASHHDR *hctlv = hashp->hctl;
 975         HASHBUCKET      newElement;
 976
 977         for (;;)
 978         {
 979                 /* if partitioned, must lock to touch nentries and freeList */
 980                 if (IS_PARTITIONED(hctlv))
 981                         SpinLockAcquire(&hctlv->mutex);
 982
 983                 /* try to get an entry from the freelist */
 984                 newElement = hctlv->freeList;
 985                 if (newElement != NULL)
 986                         break;
 987
 988                 /* no free elements.  allocate another chunk of buckets */
 989                 if (IS_PARTITIONED(hctlv))
 990                         SpinLockRelease(&hctlv->mutex);
 991
 992                 if (!element_alloc(hashp, hctlv->nelem_alloc))
 993                 {
 994                         /* out of memory */
 995                         return NULL;
 996                 }
 997         }
 998
 999         /* remove entry from freelist, bump nentries */
1000         hctlv->freeList = newElement->link;
1001         hctlv->nentries++;
1002
1003         if (IS_PARTITIONED(hctlv))
1004                 SpinLockRelease(&hctlv->mutex);
1005
1006         return newElement;
1007 }
1008
1009 /*
1010  * hash_get_num_entries -- get the number of entries in a hashtable
1011  */
1012 long
1013 hash_get_num_entries(HTAB *hashp)
1014 {
1015         /*
1016          * We currently don't bother with the mutex; it's only sensible to call
1017          * this function if you've got lock on all partitions of the table.
1018          */
1019         return hashp->hctl->nentries;
1020 }
1021
1022 /*
1023  * hash_seq_init/_search/_term
1024  *                      Sequentially search through hash table and return
1025  *                      all the elements one by one, return NULL when no more.
1026  *
1027  * hash_seq_term should be called if and only if the scan is abandoned before
1028  * completion; if hash_seq_search returns NULL then it has already done the
1029  * end-of-scan cleanup.
1030  *
1031  * NOTE: caller may delete the returned element before continuing the scan.
1032  * However, deleting any other element while the scan is in progress is
1033  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
1034  * if elements are added to the table while the scan is in progress, it is
1035  * unspecified whether they will be visited by the scan or not.
1036  *
1037  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1038  * worry about hash_seq_term cleanup, if the hashtable is first locked against
1039  * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
1040  * wherein it is inconvenient to track whether a scan is still open, and
1041  * there's no possibility of further insertions after readout has begun.
1042  *
1043  * NOTE: to use this with a partitioned hashtable, caller had better hold
1044  * at least shared lock on all partitions of the table throughout the scan!
1045  * We can cope with insertions or deletions by our own backend, but *not*
1046  * with concurrent insertions or deletions by another.
1047  */
1048 void
1049 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1050 {
1051         status->hashp = hashp;
1052         status->curBucket = 0;
1053         status->curEntry = NULL;
1054         if (!hashp->frozen)
1055                 register_seq_scan(hashp);
1056 }
1057
1058 void *
1059 hash_seq_search(HASH_SEQ_STATUS *status)
1060 {
1061         HTAB       *hashp;
1062         HASHHDR    *hctl;
1063         uint32          max_bucket;
1064         long            ssize;
1065         long            segment_num;
1066         long            segment_ndx;
1067         HASHSEGMENT segp;
1068         uint32          curBucket;
1069         HASHELEMENT *curElem;
1070
1071         if ((curElem = status->curEntry) != NULL)
1072         {
1073                 /* Continuing scan of curBucket... */
1074                 status->curEntry = curElem->link;
1075                 if (status->curEntry == NULL)   /* end of this bucket */
1076                         ++status->curBucket;
1077                 return (void *) ELEMENTKEY(curElem);
1078         }
1079
1080         /*
1081          * Search for next nonempty bucket starting at curBucket.
1082          */
1083         curBucket = status->curBucket;
1084         hashp = status->hashp;
1085         hctl = hashp->hctl;
1086         ssize = hashp->ssize;
1087         max_bucket = hctl->max_bucket;
1088
1089         if (curBucket > max_bucket)
1090         {
1091                 hash_seq_term(status);
1092                 return NULL;                    /* search is done */
1093         }
1094
1095         /*
1096          * first find the right segment in the table directory.
1097          */
1098         segment_num = curBucket >> hashp->sshift;
1099         segment_ndx = MOD(curBucket, ssize);
1100
1101         segp = hashp->dir[segment_num];
1102
1103         /*
1104          * Pick up the first item in this bucket's chain.  If chain is not empty
1105          * we can begin searching it.  Otherwise we have to advance to find the
1106          * next nonempty bucket.  We try to optimize that case since searching a
1107          * near-empty hashtable has to iterate this loop a lot.
1108          */
1109         while ((curElem = segp[segment_ndx]) == NULL)
1110         {
1111                 /* empty bucket, advance to next */
1112                 if (++curBucket > max_bucket)
1113                 {
1114                         status->curBucket = curBucket;
1115                         hash_seq_term(status);
1116                         return NULL;            /* search is done */
1117                 }
1118                 if (++segment_ndx >= ssize)
1119                 {
1120                         segment_num++;
1121                         segment_ndx = 0;
1122                         segp = hashp->dir[segment_num];
1123                 }
1124         }
1125
1126         /* Begin scan of curBucket... */
1127         status->curEntry = curElem->link;
1128         if (status->curEntry == NULL)           /* end of this bucket */
1129                 ++curBucket;
1130         status->curBucket = curBucket;
1131         return (void *) ELEMENTKEY(curElem);
1132 }
1133
1134 void
1135 hash_seq_term(HASH_SEQ_STATUS *status)
1136 {
1137         if (!status->hashp->frozen)
1138                 deregister_seq_scan(status->hashp);
1139 }
1140
1141 /*
1142  * hash_freeze
1143  *                      Freeze a hashtable against future insertions (deletions are
1144  *                      still allowed)
1145  *
1146  * The reason for doing this is that by preventing any more bucket splits,
1147  * we no longer need to worry about registering hash_seq_search scans,
1148  * and thus caller need not be careful about ensuring hash_seq_term gets
1149  * called at the right times.
1150  *
1151  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1152  * with active scans (since hash_seq_term would then do the wrong thing).
1153  */
1154 void
1155 hash_freeze(HTAB *hashp)
1156 {
1157         if (hashp->isshared)
1158                 elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1159         if (!hashp->frozen && has_seq_scans(hashp))
1160                 elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1161                          hashp->tabname);
1162         hashp->frozen = true;
1163 }
1164
1165
1166 /********************************* UTILITIES ************************/
1167
1168 /*
1169  * Expand the table by adding one more hash bucket.
1170  */
1171 static bool
1172 expand_table(HTAB *hashp)
1173 {
1174         HASHHDR    *hctl = hashp->hctl;
1175         HASHSEGMENT old_seg,
1176                                 new_seg;
1177         long            old_bucket,
1178                                 new_bucket;
1179         long            new_segnum,
1180                                 new_segndx;
1181         long            old_segnum,
1182                                 old_segndx;
1183         HASHBUCKET *oldlink,
1184                            *newlink;
1185         HASHBUCKET      currElement,
1186                                 nextElement;
1187
1188         Assert(!IS_PARTITIONED(hctl));
1189
1190 #ifdef HASH_STATISTICS
1191         hash_expansions++;
1192 #endif
1193
1194         new_bucket = hctl->max_bucket + 1;
1195         new_segnum = new_bucket >> hashp->sshift;
1196         new_segndx = MOD(new_bucket, hashp->ssize);
1197
1198         if (new_segnum >= hctl->nsegs)
1199         {
1200                 /* Allocate new segment if necessary -- could fail if dir full */
1201                 if (new_segnum >= hctl->dsize)
1202                         if (!dir_realloc(hashp))
1203                                 return false;
1204                 if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
1205                         return false;
1206                 hctl->nsegs++;
1207         }
1208
1209         /* OK, we created a new bucket */
1210         hctl->max_bucket++;
1211
1212         /*
1213          * *Before* changing masks, find old bucket corresponding to same hash
1214          * values; values in that bucket may need to be relocated to new bucket.
1215          * Note that new_bucket is certainly larger than low_mask at this point,
1216          * so we can skip the first step of the regular hash mask calc.
1217          */
1218         old_bucket = (new_bucket & hctl->low_mask);
1219
1220         /*
1221          * If we crossed a power of 2, readjust masks.
1222          */
1223         if ((uint32) new_bucket > hctl->high_mask)
1224         {
1225                 hctl->low_mask = hctl->high_mask;
1226                 hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1227         }
1228
1229         /*
1230          * Relocate records to the new bucket.  NOTE: because of the way the hash
1231          * masking is done in calc_bucket, only one old bucket can need to be
1232          * split at this point.  With a different way of reducing the hash value,
1233          * that might not be true!
1234          */
1235         old_segnum = old_bucket >> hashp->sshift;
1236         old_segndx = MOD(old_bucket, hashp->ssize);
1237
1238         old_seg = hashp->dir[old_segnum];
1239         new_seg = hashp->dir[new_segnum];
1240
1241         oldlink = &old_seg[old_segndx];
1242         newlink = &new_seg[new_segndx];
1243
1244         for (currElement = *oldlink;
1245                  currElement != NULL;
1246                  currElement = nextElement)
1247         {
1248                 nextElement = currElement->link;
1249                 if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1250                 {
1251                         *oldlink = currElement;
1252                         oldlink = &currElement->link;
1253                 }
1254                 else
1255                 {
1256                         *newlink = currElement;
1257                         newlink = &currElement->link;
1258                 }
1259         }
1260         /* don't forget to terminate the rebuilt hash chains... */
1261         *oldlink = NULL;
1262         *newlink = NULL;
1263
1264         return true;
1265 }
1266
1267
1268 static bool
1269 dir_realloc(HTAB *hashp)
1270 {
1271         HASHSEGMENT *p;
1272         HASHSEGMENT *old_p;
1273         long            new_dsize;
1274         long            old_dirsize;
1275         long            new_dirsize;
1276
1277         if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1278                 return false;
1279
1280         /* Reallocate directory */
1281         new_dsize = hashp->hctl->dsize << 1;
1282         old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1283         new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1284
1285         old_p = hashp->dir;
1286         CurrentDynaHashCxt = hashp->hcxt;
1287         p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1288
1289         if (p != NULL)
1290         {
1291                 memcpy(p, old_p, old_dirsize);
1292                 MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1293                 hashp->dir = p;
1294                 hashp->hctl->dsize = new_dsize;
1295
1296                 /* XXX assume the allocator is palloc, so we know how to free */
1297                 Assert(hashp->alloc == DynaHashAlloc);
1298                 pfree(old_p);
1299
1300                 return true;
1301         }
1302
1303         return false;
1304 }
1305
1306
1307 static HASHSEGMENT
1308 seg_alloc(HTAB *hashp)
1309 {
1310         HASHSEGMENT segp;
1311
1312         CurrentDynaHashCxt = hashp->hcxt;
1313         segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1314
1315         if (!segp)
1316                 return NULL;
1317
1318         MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1319
1320         return segp;
1321 }
1322
1323 /*
1324  * allocate some new elements and link them into the free list
1325  */
1326 static bool
1327 element_alloc(HTAB *hashp, int nelem)
1328 {
1329         /* use volatile pointer to prevent code rearrangement */
1330         volatile HASHHDR *hctlv = hashp->hctl;
1331         Size            elementSize;
1332         HASHELEMENT *firstElement;
1333         HASHELEMENT *tmpElement;
1334         HASHELEMENT *prevElement;
1335         int                     i;
1336
1337         /* Each element has a HASHELEMENT header plus user data. */
1338         elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);
1339
1340         CurrentDynaHashCxt = hashp->hcxt;
1341         firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
1342
1343         if (!firstElement)
1344                 return false;
1345
1346         /* prepare to link all the new entries into the freelist */
1347         prevElement = NULL;
1348         tmpElement = firstElement;
1349         for (i = 0; i < nelem; i++)
1350         {
1351                 tmpElement->link = prevElement;
1352                 prevElement = tmpElement;
1353                 tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1354         }
1355
1356         /* if partitioned, must lock to touch freeList */
1357         if (IS_PARTITIONED(hctlv))
1358                 SpinLockAcquire(&hctlv->mutex);
1359
1360         /* freelist could be nonempty if two backends did this concurrently */
1361         firstElement->link = hctlv->freeList;
1362         hctlv->freeList = prevElement;
1363
1364         if (IS_PARTITIONED(hctlv))
1365                 SpinLockRelease(&hctlv->mutex);
1366
1367         return true;
1368 }
1369
1370 /* complain when we have detected a corrupted hashtable */
1371 static void
1372 hash_corrupted(HTAB *hashp)
1373 {
1374         /*
1375          * If the corruption is in a shared hashtable, we'd better force a
1376          * systemwide restart.  Otherwise, just shut down this one backend.
1377          */
1378         if (hashp->isshared)
1379                 elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1380         else
1381                 elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1382 }
1383
1384 /* calculate ceil(log base 2) of num */
1385 int
1386 my_log2(long num)
1387 {
1388         int                     i;
1389         long            limit;
1390
1391         for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
1392                 ;
1393         return i;
1394 }
1395
1396
1397 /************************* SEQ SCAN TRACKING ************************/
1398
1399 /*
1400  * We track active hash_seq_search scans here.  The need for this mechanism
1401  * comes from the fact that a scan will get confused if a bucket split occurs
1402  * while it's in progress: it might visit entries twice, or even miss some
1403  * entirely (if it's partway through the same bucket that splits).  Hence
1404  * we want to inhibit bucket splits if there are any active scans on the
1405  * table being inserted into.  This is a fairly rare case in current usage,
1406  * so just postponing the split until the next insertion seems sufficient.
1407  *
1408  * Given present usages of the function, only a few scans are likely to be
1409  * open concurrently; so a finite-size stack of open scans seems sufficient,
1410  * and we don't worry that linear search is too slow.  Note that we do
1411  * allow multiple scans of the same hashtable to be open concurrently.
1412  *
1413  * This mechanism can support concurrent scan and insertion in a shared
1414  * hashtable if it's the same backend doing both.  It would fail otherwise,
1415  * but locking reasons seem to preclude any such scenario anyway, so we don't
1416  * worry.
1417  *
1418  * This arrangement is reasonably robust if a transient hashtable is deleted
1419  * without notifying us.  The absolute worst case is we might inhibit splits
1420  * in another table created later at exactly the same address.  We will give
1421  * a warning at transaction end for reference leaks, so any bugs leading to
1422  * lack of notification should be easy to catch.
1423  */
1424
1425 #define MAX_SEQ_SCANS 100
1426
1427 static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
1428 static int      seq_scan_level[MAX_SEQ_SCANS];          /* subtransaction nest level */
1429 static int      num_seq_scans = 0;
1430
1431
1432 /* Register a table as having an active hash_seq_search scan */
1433 static void
1434 register_seq_scan(HTAB *hashp)
1435 {
1436         if (num_seq_scans >= MAX_SEQ_SCANS)
1437                 elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1438                          hashp->tabname);
1439         seq_scan_tables[num_seq_scans] = hashp;
1440         seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1441         num_seq_scans++;
1442 }
1443
1444 /* Deregister an active scan */
1445 static void
1446 deregister_seq_scan(HTAB *hashp)
1447 {
1448         int                     i;
1449
1450         /* Search backward since it's most likely at the stack top */
1451         for (i = num_seq_scans - 1; i >= 0; i--)
1452         {
1453                 if (seq_scan_tables[i] == hashp)
1454                 {
1455                         seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1456                         seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1457                         num_seq_scans--;
1458                         return;
1459                 }
1460         }
1461         elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1462                  hashp->tabname);
1463 }
1464
1465 /* Check if a table has any active scan */
1466 static bool
1467 has_seq_scans(HTAB *hashp)
1468 {
1469         int                     i;
1470
1471         for (i = 0; i < num_seq_scans; i++)
1472         {
1473                 if (seq_scan_tables[i] == hashp)
1474                         return true;
1475         }
1476         return false;
1477 }
1478
1479 /* Clean up any open scans at end of transaction */
1480 void
1481 AtEOXact_HashTables(bool isCommit)
1482 {
1483         /*
1484          * During abort cleanup, open scans are expected; just silently clean 'em
1485          * out.  An open scan at commit means someone forgot a hash_seq_term()
1486          * call, so complain.
1487          *
1488          * Note: it's tempting to try to print the tabname here, but refrain for
1489          * fear of touching deallocated memory.  This isn't a user-facing message
1490          * anyway, so it needn't be pretty.
1491          */
1492         if (isCommit)
1493         {
1494                 int                     i;
1495
1496                 for (i = 0; i < num_seq_scans; i++)
1497                 {
1498                         elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1499                                  seq_scan_tables[i]);
1500                 }
1501         }
1502         num_seq_scans = 0;
1503 }
1504
1505 /* Clean up any open scans at end of subtransaction */
1506 void
1507 AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1508 {
1509         int                     i;
1510
1511         /*
1512          * Search backward to make cleanup easy.  Note we must check all entries,
1513          * not only those at the end of the array, because deletion technique
1514          * doesn't keep them in order.
1515          */
1516         for (i = num_seq_scans - 1; i >= 0; i--)
1517         {
1518                 if (seq_scan_level[i] >= nestDepth)
1519                 {
1520                         if (isCommit)
1521                                 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1522                                          seq_scan_tables[i]);
1523                         seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1524                         seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1525                         num_seq_scans--;
1526                 }
1527         }
1528 }