Change NTUP_PER_BUCKET to 1 to improve hash join lookup speed.

author Robert Haas <rhaas@postgresql.org>

Fri, 12 Sep 2014 20:11:58 +0000 (16:11 -0400)

committer Robert Haas <rhaas@postgresql.org>

Fri, 12 Sep 2014 20:18:09 +0000 (16:18 -0400)
author Robert Haas <rhaas@postgresql.org>
Fri, 12 Sep 2014 20:11:58 +0000 (16:11 -0400)
committer Robert Haas <rhaas@postgresql.org>
Fri, 12 Sep 2014 20:18:09 +0000 (16:18 -0400)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c

index 3ef7cfbb5bfad5aa4da83875813acc190e706d6a..b3203ba7fa4deeda22471ab3561f776f9b800e0c 100644 (file)
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -386,7 +386,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
   */
  
  /* Target bucket loading (tuples per bucket) */
-#define NTUP_PER_BUCKET                        10
+#define NTUP_PER_BUCKET                        1
  
  void
  ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
@@ -396,12 +396,13 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
  {
         int                     tupsize;
         double          inner_rel_bytes;
+       long            bucket_bytes;
         long            hash_table_bytes;
         long            skew_table_bytes;
         long            max_pointers;
-       int                     nbatch;
+       int                     nbatch = 1;
         int                     nbuckets;
-       int                     i;
+       double          dbuckets;
  
         /* Force a plausible relation size if no info */
         if (ntuples <= 0.0)
@@ -460,56 +461,61 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
  
         /*
          * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
-        * memory is filled.  Set nbatch to the smallest power of 2 that appears
-        * sufficient.  The Min() steps limit the results so that the pointer
-        * arrays we'll try to allocate do not exceed work_mem.
+        * memory is filled, assuming a single batch.  The Min() step limits the
+        * results so that the pointer arrays we'll try to allocate do not exceed
+        * work_mem.
          */
         max_pointers = (work_mem * 1024L) / sizeof(void *);
         /* also ensure we avoid integer overflow in nbatch and nbuckets */
         max_pointers = Min(max_pointers, INT_MAX / 2);
+       dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
+       dbuckets = Min(dbuckets, max_pointers);
+       nbuckets = Max((int) dbuckets, 1024);
+       nbuckets = 1 << my_log2(nbuckets);
+       bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
  
-       if (inner_rel_bytes > hash_table_bytes)
+       /*
+        * If there's not enough space to store the projected number of tuples
+        * and the required bucket headers, we will need multiple batches.
+        */
+       if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
         {
                 /* We'll need multiple batches */
                 long            lbuckets;
                 double          dbatch;
                 int                     minbatch;
+               long            bucket_size;
  
-               lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET;
+               /*
+                * Estimate the number of buckets we'll want to have when work_mem
+                * is entirely full.  Each bucket will contain a bucket pointer plus
+                * NTUP_PER_BUCKET tuples, whose projected size already includes
+                * overhead for the hash code, pointer to the next tuple, etc.
+                */
+               bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
+               lbuckets = 1 << my_log2(hash_table_bytes / bucket_size);
                 lbuckets = Min(lbuckets, max_pointers);
                 nbuckets = (int) lbuckets;
+               bucket_bytes = nbuckets * sizeof(HashJoinTuple);
+
+               /*
+                * Buckets are simple pointers to hashjoin tuples, while tupsize
+                * includes the pointer, hash code, and MinimalTupleData.  So buckets
+                * should never really exceed 25% of work_mem (even for
+                * NTUP_PER_BUCKET=1); except maybe * for work_mem values that are
+                * not 2^N bytes, where we might get more * because of doubling.
+                * So let's look for 50% here.
+                */
+               Assert(bucket_bytes <= hash_table_bytes / 2);
  
-               dbatch = ceil(inner_rel_bytes / hash_table_bytes);
+               /* Calculate required number of batches. */
+               dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
                 dbatch = Min(dbatch, max_pointers);
                 minbatch = (int) dbatch;
                 nbatch = 2;
                 while (nbatch < minbatch)
                         nbatch <<= 1;
         }
-       else
-       {
-               /* We expect the hashtable to fit in memory */
-               double          dbuckets;
-
-               dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
-               dbuckets = Min(dbuckets, max_pointers);
-               nbuckets = (int) dbuckets;
-
-               nbatch = 1;
-       }
-
-       /*
-        * Both nbuckets and nbatch must be powers of 2 to make
-        * ExecHashGetBucketAndBatch fast.  We already fixed nbatch; now inflate
-        * nbuckets to the next larger power of 2.  We also force nbuckets to not
-        * be real small, by starting the search at 2^10.  (Note: above we made
-        * sure that nbuckets is not more than INT_MAX / 2, so this loop cannot
-        * overflow, nor can the final shift to recalculate nbuckets.)
-        */
-       i = 10;
-       while ((1 << i) < nbuckets)
-               i++;
-       nbuckets = (1 << i);
  
         *numbuckets = nbuckets;
         *numbatches = nbatch;
@@ -754,7 +760,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
                 hashtable->spaceUsed += hashTupleSize;
                 if (hashtable->spaceUsed > hashtable->spacePeak)
                         hashtable->spacePeak = hashtable->spaceUsed;
-               if (hashtable->spaceUsed > hashtable->spaceAllowed)
+               if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+                       > hashtable->spaceAllowed)
                         ExecHashIncreaseNumBatches(hashtable);
         }
         else
author	Robert Haas <rhaas@postgresql.org>
	Fri, 12 Sep 2014 20:11:58 +0000 (16:11 -0400)
committer	Robert Haas <rhaas@postgresql.org>
	Fri, 12 Sep 2014 20:18:09 +0000 (16:18 -0400)