Fix array size allocation for HashAggregate hash keys.

author Andrew Gierth <rhodiumtoad@postgresql.org>
Thu, 23 May 2019 14:26:01 +0000 (15:26 +0100)

committer Andrew Gierth <rhodiumtoad@postgresql.org>
Thu, 23 May 2019 14:39:17 +0000 (15:39 +0100)
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c

index c39531bd11c547eb15d86ea00bf0d3f5e09981d4..779d8ac1e69e52e233c9b5545f39a706122304ce 100644 (file)
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1932,9 +1932,14 @@ build_hash_table(AggState *aggstate)
   * by themselves, and secondly ctids for row-marks.
   *
   * To eliminate duplicates, we build a bitmapset of the needed columns, and
- * then build an array of the columns included in the hashtable.  Note that
- * the array is preserved over ExecReScanAgg, so we allocate it in the
- * per-query context (unlike the hash table itself).
+ * then build an array of the columns included in the hashtable. We might
+ * still have duplicates if the passed-in grpColIdx has them, which can happen
+ * in edge cases from semijoins/distinct; these can't always be removed,
+ * because it's not certain that the duplicate cols will be using the same
+ * hash function.
+ *
+ * Note that the array is preserved over ExecReScanAgg, so we allocate it in
+ * the per-query context (unlike the hash table itself).
   */
  static void
  find_hash_columns(AggState *aggstate)
@@ -1954,6 +1959,7 @@ find_hash_columns(AggState *aggstate)
                 AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
                 List       *hashTlist = NIL;
                 TupleDesc       hashDesc;
+               int                     maxCols;
                 int                     i;
  
                 perhash->largestGrpColIdx = 0;
@@ -1978,15 +1984,24 @@ find_hash_columns(AggState *aggstate)
                                         colnos = bms_del_member(colnos, attnum);
                         }
                 }
-               /* Add in all the grouping columns */
-               for (i = 0; i < perhash->numCols; i++)
-                       colnos = bms_add_member(colnos, grpColIdx[i]);
+
+               /*
+                * Compute maximum number of input columns accounting for possible
+                * duplications in the grpColIdx array, which can happen in some edge
+                * cases where HashAggregate was generated as part of a semijoin or a
+                * DISTINCT.
+                */
+               maxCols = bms_num_members(colnos) + perhash->numCols;
  
                 perhash->hashGrpColIdxInput =
-                       palloc(bms_num_members(colnos) * sizeof(AttrNumber));
+                       palloc(maxCols * sizeof(AttrNumber));
                 perhash->hashGrpColIdxHash =
                         palloc(perhash->numCols * sizeof(AttrNumber));
  
+               /* Add all the grouping columns to colnos */
+               for (i = 0; i < perhash->numCols; i++)
+                       colnos = bms_add_member(colnos, grpColIdx[i]);
+
                 /*
                  * First build mapping for columns directly hashed. These are the
                  * first, because they'll be accessed when computing hash values and
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out

index 305ffa8514763ee81b396f9649e2b3988339183a..4d97a841aea28c1f0115673cb450781041c3b6c1 100644 (file)
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2100,3 +2100,21 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
   ba       |    0 |     1
  (2 rows)
  
+-- Make sure that generation of HashAggregate for uniqification purposes
+-- does not lead to array overflow due to unexpected duplicate hash keys
+-- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
+explain (costs off)
+  select 1 from tenk1
+   where (hundred, thousand) in (select twothousand, twothousand from onek);
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.hundred = onek.twothousand)
+   ->  Seq Scan on tenk1
+         Filter: (hundred = thousand)
+   ->  Hash
+         ->  HashAggregate
+               Group Key: onek.twothousand, onek.twothousand
+               ->  Seq Scan on onek
+(8 rows)
+
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql

index cf21fca14b8e53ca42f4f652cb9b0b00b9ee8c4c..39ef5287557d51ba2b77afe1a594afe61a0e24f2 100644 (file)
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -926,3 +926,10 @@ select v||'a', case v||'a' when 'aa' then 1 else 0 end, count(*)
  select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
    from unnest(array['a','b']) u(v)
   group by v||'a' order by 1;
+
+-- Make sure that generation of HashAggregate for uniqification purposes
+-- does not lead to array overflow due to unexpected duplicate hash keys
+-- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
+explain (costs off)
+  select 1 from tenk1
+   where (hundred, thousand) in (select twothousand, twothousand from onek);
author	Andrew Gierth <rhodiumtoad@postgresql.org>
	Thu, 23 May 2019 14:26:01 +0000 (15:26 +0100)
committer	Andrew Gierth <rhodiumtoad@postgresql.org>
	Thu, 23 May 2019 14:39:17 +0000 (15:39 +0100)
src/backend/executor/nodeAgg.c		patch | blob | history
src/test/regress/expected/aggregates.out		patch | blob | history
src/test/regress/sql/aggregates.sql		patch | blob | history