Rename SortMem and VacuumMem to work_mem and maintenance_work_mem.

[postgresql] / src / backend / executor / nodeAgg.c
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c

index b45376013f95ede584d303a4a581ff7b9bb82dce..cb0a64c42771beb660c1f30a4c2aec1f39dd1711 100644 (file)
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -29,24 +29,23 @@
   *       of course).  A non-strict finalfunc can make its own choice of
   *       what to return for a NULL ending transvalue.
   *
- *       When the transvalue datatype is pass-by-reference, we have to be
- *       careful to ensure that the values survive across tuple cycles yet
- *       are not allowed to accumulate until end of query.  We do this by
- *       "ping-ponging" between two memory contexts; successive calls to the
- *       transfunc are executed in alternate contexts, passing the previous
- *       transvalue that is in the other context.      At the beginning of each
- *       tuple cycle we can reset the current output context to avoid memory
- *       usage growth.  Note: we must use MemoryContextContains() to check
- *       whether the transfunc has perhaps handed us back one of its input
- *       values rather than a freshly palloc'd value; if so, we copy the value
- *       to the context we want it in.
+ *       We compute aggregate input expressions and run the transition functions
+ *       in a temporary econtext (aggstate->tmpcontext).  This is reset at
+ *       least once per input tuple, so when the transvalue datatype is
+ *       pass-by-reference, we have to be careful to copy it into a longer-lived
+ *       memory context, and free the prior value to avoid memory leakage.
+ *       We store transvalues in the memory context aggstate->aggcontext,
+ *       which is also used for the hashtable structures in AGG_HASHED mode.
+ *       The node's regular econtext (aggstate->ss.ps.ps_ExprContext)
+ *       is used to run finalize functions and compute the output tuple;
+ *       this context can be reset once per output tuple.
   *
   *
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/executor/nodeAgg.c,v 1.84 2002/05/17 22:35:12 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/nodeAgg.c,v 1.118 2004/02/03 17:34:02 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -60,6 +59,7 @@
  #include "executor/nodeAgg.h"
  #include "miscadmin.h"
  #include "optimizer/clauses.h"
+#include "parser/parse_agg.h"
  #include "parser/parse_coerce.h"
  #include "parser/parse_expr.h"
  #include "parser/parse_oper.h"
@@ -81,7 +81,8 @@ typedef struct AggStatePerAggData
          * thereafter:
          */
  
-       /* Link to Aggref node this working state is for */
+       /* Links to Aggref expr and state nodes this working state is for */
+       AggrefExprState *aggrefstate;
         Aggref     *aggref;
  
         /* Oids of transfer functions */
@@ -139,8 +140,27 @@ typedef struct AggStatePerAggData
          */
  
         Tuplesortstate *sortstate;      /* sort object, if a DISTINCT agg */
+} AggStatePerAggData;
  
-       Datum           transValue;
+/*
+ * AggStatePerGroupData - per-aggregate-per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In AGG_PLAIN and AGG_SORTED modes, we have a single array of these
+ * structs (pointed to by aggstate->pergroup); we re-use the array for
+ * each input group, if it's AGG_SORTED mode.  In AGG_HASHED mode, the
+ * hash table contains an array of these structs for each tuple group.
+ *
+ * Logically, the sortstate field belongs in this struct, but we do not
+ * keep it here for space reasons: we don't support DISTINCT aggregates
+ * in AGG_HASHED mode, so there's no reason to use up a pointer field
+ * in every entry of the hashtable.
+ */
+typedef struct AggStatePerGroupData
+{
+       Datum           transValue;             /* current transition value */
         bool            transValueIsNull;
  
         bool            noTransValue;   /* true if transValue not set yet */
@@ -153,96 +173,138 @@ typedef struct AggStatePerAggData
          * later input value. Only the first non-NULL input will be
          * auto-substituted.
          */
-} AggStatePerAggData;
+} AggStatePerGroupData;
  
+/*
+ * To implement hashed aggregation, we need a hashtable that stores a
+ * representative tuple and an array of AggStatePerGroup structs for each
+ * distinct set of GROUP BY column values.     We compute the hash key from
+ * the GROUP BY columns.
+ */
+typedef struct AggHashEntryData *AggHashEntry;
  
-static void initialize_aggregate(AggStatePerAgg peraggstate);
-static void advance_transition_function(AggStatePerAgg peraggstate,
+typedef struct AggHashEntryData
+{
+       TupleHashEntryData shared;      /* common header for hash table entries */
+       /* per-aggregate transition status array - must be last! */
+       AggStatePerGroupData pergroup[1];       /* VARIABLE LENGTH ARRAY */
+} AggHashEntryData;                            /* VARIABLE LENGTH STRUCT */
+
+
+static void initialize_aggregates(AggState *aggstate,
+                                         AggStatePerAgg peragg,
+                                         AggStatePerGroup pergroup);
+static void advance_transition_function(AggState *aggstate,
+                                                       AggStatePerAgg peraggstate,
+                                                       AggStatePerGroup pergroupstate,
                                                         Datum newVal, bool isNull);
+static void advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup);
  static void process_sorted_aggregate(AggState *aggstate,
-                                                AggStatePerAgg peraggstate);
-static void finalize_aggregate(AggStatePerAgg peraggstate,
+                                                AggStatePerAgg peraggstate,
+                                                AggStatePerGroup pergroupstate);
+static void finalize_aggregate(AggState *aggstate,
+                                  AggStatePerAgg peraggstate,
+                                  AggStatePerGroup pergroupstate,
                                    Datum *resultVal, bool *resultIsNull);
+static void build_hash_table(AggState *aggstate);
+static AggHashEntry lookup_hash_entry(AggState *aggstate,
+                                 TupleTableSlot *slot);
+static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
+static void agg_fill_hash_table(AggState *aggstate);
+static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
  static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
  
  
  /*
- * Initialize one aggregate for a new set of input values.
+ * Initialize all aggregates for a new group of input values.
   *
   * When called, CurrentMemoryContext should be the per-query context.
   */
  static void
-initialize_aggregate(AggStatePerAgg peraggstate)
+initialize_aggregates(AggState *aggstate,
+                                         AggStatePerAgg peragg,
+                                         AggStatePerGroup pergroup)
  {
-       Aggref     *aggref = peraggstate->aggref;
+       int                     aggno;
  
-       /*
-        * Start a fresh sort operation for each DISTINCT aggregate.
-        */
-       if (aggref->aggdistinct)
+       for (aggno = 0; aggno < aggstate->numaggs; aggno++)
         {
+               AggStatePerAgg peraggstate = &peragg[aggno];
+               AggStatePerGroup pergroupstate = &pergroup[aggno];
+               Aggref     *aggref = peraggstate->aggref;
+
                 /*
-                * In case of rescan, maybe there could be an uncompleted sort
-                * operation?  Clean it up if so.
+                * Start a fresh sort operation for each DISTINCT aggregate.
                  */
-               if (peraggstate->sortstate)
-                       tuplesort_end(peraggstate->sortstate);
+               if (aggref->aggdistinct)
+               {
+                       /*
+                        * In case of rescan, maybe there could be an uncompleted sort
+                        * operation?  Clean it up if so.
+                        */
+                       if (peraggstate->sortstate)
+                               tuplesort_end(peraggstate->sortstate);
  
-               peraggstate->sortstate =
-                       tuplesort_begin_datum(peraggstate->inputType,
-                                                                 peraggstate->sortOperator,
-                                                                 false);
-       }
+                       peraggstate->sortstate =
+                               tuplesort_begin_datum(peraggstate->inputType,
+                                                                         peraggstate->sortOperator,
+                                                                         work_mem, false);
+               }
  
-       /*
-        * (Re)set transValue to the initial value.
-        *
-        * Note that when the initial value is pass-by-ref, we just reuse it
-        * without copying for each group.      Hence, transition function had
-        * better not scribble on its input, or it will fail for GROUP BY!
-        */
-       peraggstate->transValue = peraggstate->initValue;
-       peraggstate->transValueIsNull = peraggstate->initValueIsNull;
+               /*
+                * (Re)set transValue to the initial value.
+                *
+                * Note that when the initial value is pass-by-ref, we must copy it
+                * (into the aggcontext) since we will pfree the transValue later.
+                */
+               if (peraggstate->initValueIsNull)
+                       pergroupstate->transValue = peraggstate->initValue;
+               else
+               {
+                       MemoryContext oldContext;
  
-       /*
-        * If the initial value for the transition state doesn't exist in the
-        * pg_aggregate table then we will let the first non-NULL value
-        * returned from the outer procNode become the initial value. (This is
-        * useful for aggregates like max() and min().)  The noTransValue flag
-        * signals that we still need to do this.
-        */
-       peraggstate->noTransValue = peraggstate->initValueIsNull;
+                       oldContext = MemoryContextSwitchTo(aggstate->aggcontext);
+                       pergroupstate->transValue = datumCopy(peraggstate->initValue,
+                                                                                        peraggstate->transtypeByVal,
+                                                                                         peraggstate->transtypeLen);
+                       MemoryContextSwitchTo(oldContext);
+               }
+               pergroupstate->transValueIsNull = peraggstate->initValueIsNull;
+
+               /*
+                * If the initial value for the transition state doesn't exist in
+                * the pg_aggregate table then we will let the first non-NULL
+                * value returned from the outer procNode become the initial
+                * value. (This is useful for aggregates like max() and min().)
+                * The noTransValue flag signals that we still need to do this.
+                */
+               pergroupstate->noTransValue = peraggstate->initValueIsNull;
+       }
  }
  
  /*
   * Given a new input value, advance the transition function of an aggregate.
   *
- * When called, CurrentMemoryContext should be the context we want the
- * transition function result to be delivered into on this cycle.
+ * It doesn't matter which memory context this is called in.
   */
  static void
-advance_transition_function(AggStatePerAgg peraggstate,
+advance_transition_function(AggState *aggstate,
+                                                       AggStatePerAgg peraggstate,
+                                                       AggStatePerGroup pergroupstate,
                                                         Datum newVal, bool isNull)
  {
         FunctionCallInfoData fcinfo;
+       MemoryContext oldContext;
  
         if (peraggstate->transfn.fn_strict)
         {
+               /*
+                * For a strict transfn, nothing happens at a NULL input tuple; we
+                * just keep the prior transValue.
+                */
                 if (isNull)
-               {
-                       /*
-                        * For a strict transfn, nothing happens at a NULL input
-                        * tuple; we just keep the prior transValue.  However, if the
-                        * transtype is pass-by-ref, we have to copy it into the new
-                        * context because the old one is going to get reset.
-                        */
-                       if (!peraggstate->transValueIsNull)
-                               peraggstate->transValue = datumCopy(peraggstate->transValue,
-                                                                                        peraggstate->transtypeByVal,
-                                                                                         peraggstate->transtypeLen);
                         return;
-               }
-               if (peraggstate->noTransValue)
+               if (pergroupstate->noTransValue)
                 {
                         /*
                          * transValue has not been initialized. This is the first
@@ -251,18 +313,20 @@ advance_transition_function(AggStatePerAgg peraggstate,
                          * is binary-compatible with its transtype, so straight copy
                          * here is OK.)
                          *
-                        * We had better copy the datum if it is pass-by-ref, since the
-                        * given pointer may be pointing into a scan tuple that will
-                        * be freed on the next iteration of the scan.
+                        * We must copy the datum into aggcontext if it is pass-by-ref.
+                        * We do not need to pfree the old transValue, since it's
+                        * NULL.
                          */
-                       peraggstate->transValue = datumCopy(newVal,
+                       oldContext = MemoryContextSwitchTo(aggstate->aggcontext);
+                       pergroupstate->transValue = datumCopy(newVal,
                                                                                          peraggstate->transtypeByVal,
                                                                                           peraggstate->transtypeLen);
-                       peraggstate->transValueIsNull = false;
-                       peraggstate->noTransValue = false;
+                       pergroupstate->transValueIsNull = false;
+                       pergroupstate->noTransValue = false;
+                       MemoryContextSwitchTo(oldContext);
                         return;
                 }
-               if (peraggstate->transValueIsNull)
+               if (pergroupstate->transValueIsNull)
                 {
                         /*
                          * Don't call a strict function with NULL inputs.  Note it is
@@ -275,32 +339,94 @@ advance_transition_function(AggStatePerAgg peraggstate,
                 }
         }
  
-       /* OK to call the transition function */
-       MemSet(&fcinfo, 0, sizeof(fcinfo));
+       /* We run the transition functions in per-input-tuple memory context */
+       oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
+
+       /*
+        * OK to call the transition function
+        *
+        * This is heavily-used code, so manually zero just the necessary fields
+        * instead of using MemSet().  Compare FunctionCall2().
+        */
+
+       /* MemSet(&fcinfo, 0, sizeof(fcinfo)); */
+       fcinfo.context = NULL;
+       fcinfo.resultinfo = NULL;
+       fcinfo.isnull = false;
+
         fcinfo.flinfo = &peraggstate->transfn;
         fcinfo.nargs = 2;
-       fcinfo.arg[0] = peraggstate->transValue;
-       fcinfo.argnull[0] = peraggstate->transValueIsNull;
+       fcinfo.arg[0] = pergroupstate->transValue;
+       fcinfo.argnull[0] = pergroupstate->transValueIsNull;
         fcinfo.arg[1] = newVal;
         fcinfo.argnull[1] = isNull;
  
         newVal = FunctionCallInvoke(&fcinfo);
  
         /*
-        * If the transition function was uncooperative, it may have given us
-        * a pass-by-ref result that points at the scan tuple or the
-        * prior-cycle working memory.  Copy it into the active context if it
-        * doesn't look right.
+        * If pass-by-ref datatype, must copy the new value into aggcontext
+        * and pfree the prior transValue.      But if transfn returned a pointer
+        * to its first input, we don't need to do anything.
          */
-       if (!peraggstate->transtypeByVal && !fcinfo.isnull &&
-               !MemoryContextContains(CurrentMemoryContext,
-                                                          DatumGetPointer(newVal)))
-               newVal = datumCopy(newVal,
-                                                  peraggstate->transtypeByVal,
-                                                  peraggstate->transtypeLen);
+       if (!peraggstate->transtypeByVal &&
+       DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue))
+       {
+               if (!fcinfo.isnull)
+               {
+                       MemoryContextSwitchTo(aggstate->aggcontext);
+                       newVal = datumCopy(newVal,
+                                                          peraggstate->transtypeByVal,
+                                                          peraggstate->transtypeLen);
+               }
+               if (!pergroupstate->transValueIsNull)
+                       pfree(DatumGetPointer(pergroupstate->transValue));
+       }
  
-       peraggstate->transValue = newVal;
-       peraggstate->transValueIsNull = fcinfo.isnull;
+       pergroupstate->transValue = newVal;
+       pergroupstate->transValueIsNull = fcinfo.isnull;
+
+       MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Advance all the aggregates for one input tuple.     The input tuple
+ * has been stored in tmpcontext->ecxt_scantuple, so that it is accessible
+ * to ExecEvalExpr.  pergroup is the array of per-group structs to use
+ * (this might be in a hashtable entry).
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static void
+advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup)
+{
+       ExprContext *econtext = aggstate->tmpcontext;
+       int                     aggno;
+
+       for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+       {
+               AggStatePerAgg peraggstate = &aggstate->peragg[aggno];
+               AggStatePerGroup pergroupstate = &pergroup[aggno];
+               AggrefExprState *aggrefstate = peraggstate->aggrefstate;
+               Aggref     *aggref = peraggstate->aggref;
+               Datum           newVal;
+               bool            isNull;
+
+               newVal = ExecEvalExprSwitchContext(aggrefstate->target, econtext,
+                                                                                  &isNull, NULL);
+
+               if (aggref->aggdistinct)
+               {
+                       /* in DISTINCT mode, we may ignore nulls */
+                       if (isNull)
+                               continue;
+                       tuplesort_putdatum(peraggstate->sortstate, newVal, isNull);
+               }
+               else
+               {
+                       advance_transition_function(aggstate, peraggstate, pergroupstate,
+                                                                               newVal, isNull);
+               }
+       }
  }
  
  /*
@@ -313,10 +439,12 @@ advance_transition_function(AggStatePerAgg peraggstate,
   */
  static void
  process_sorted_aggregate(AggState *aggstate,
-                                                AggStatePerAgg peraggstate)
+                                                AggStatePerAgg peraggstate,
+                                                AggStatePerGroup pergroupstate)
  {
         Datum           oldVal = (Datum) 0;
         bool            haveOldVal = false;
+       MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory;
         MemoryContext oldContext;
         Datum           newVal;
         bool            isNull;
@@ -340,12 +468,11 @@ process_sorted_aggregate(AggState *aggstate,
                         continue;
  
                 /*
-                * Clear and select the current working context for evaluation of
-                * the equality function and transition function.
+                * Clear and select the working context for evaluation of the
+                * equality function and transition function.
                  */
-               MemoryContextReset(aggstate->agg_cxt[aggstate->which_cxt]);
-               oldContext =
-                       MemoryContextSwitchTo(aggstate->agg_cxt[aggstate->which_cxt]);
+               MemoryContextReset(workcontext);
+               oldContext = MemoryContextSwitchTo(workcontext);
  
                 if (haveOldVal &&
                         DatumGetBool(FunctionCall2(&peraggstate->equalfn,
@@ -354,24 +481,15 @@ process_sorted_aggregate(AggState *aggstate,
                         /* equal to prior, so forget this one */
                         if (!peraggstate->inputtypeByVal)
                                 pfree(DatumGetPointer(newVal));
-
-                       /*
-                        * note we do NOT flip contexts in this case, so no need to
-                        * copy prior transValue to other context.
-                        */
                 }
                 else
                 {
-                       advance_transition_function(peraggstate, newVal, false);
-
-                       /*
-                        * Make the other context current so that this transition
-                        * result is preserved.
-                        */
-                       aggstate->which_cxt = 1 - aggstate->which_cxt;
+                       advance_transition_function(aggstate, peraggstate, pergroupstate,
+                                                                               newVal, false);
                         /* forget the old value, if any */
                         if (haveOldVal && !peraggstate->inputtypeByVal)
                                 pfree(DatumGetPointer(oldVal));
+                       /* and remember the new one for subsequent equality checks */
                         oldVal = newVal;
                         haveOldVal = true;
                 }
@@ -389,13 +507,19 @@ process_sorted_aggregate(AggState *aggstate,
  /*
   * Compute the final value of one aggregate.
   *
- * When called, CurrentMemoryContext should be the context where we want
- * final values delivered (ie, the per-output-tuple expression context).
+ * The final function will be run, and the result delivered, in the
+ * output-tuple context; caller's CurrentMemoryContext does not matter.
   */
  static void
-finalize_aggregate(AggStatePerAgg peraggstate,
+finalize_aggregate(AggState *aggstate,
+                                  AggStatePerAgg peraggstate,
+                                  AggStatePerGroup pergroupstate,
                                    Datum *resultVal, bool *resultIsNull)
  {
+       MemoryContext oldContext;
+
+       oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
+
         /*
          * Apply the agg's finalfn if one is provided, else return transValue.
          */
@@ -406,9 +530,9 @@ finalize_aggregate(AggStatePerAgg peraggstate,
                 MemSet(&fcinfo, 0, sizeof(fcinfo));
                 fcinfo.flinfo = &peraggstate->finalfn;
                 fcinfo.nargs = 1;
-               fcinfo.arg[0] = peraggstate->transValue;
-               fcinfo.argnull[0] = peraggstate->transValueIsNull;
-               if (fcinfo.flinfo->fn_strict && peraggstate->transValueIsNull)
+               fcinfo.arg[0] = pergroupstate->transValue;
+               fcinfo.argnull[0] = pergroupstate->transValueIsNull;
+               if (fcinfo.flinfo->fn_strict && pergroupstate->transValueIsNull)
                 {
                         /* don't call a strict function with NULL inputs */
                         *resultVal = (Datum) 0;
@@ -422,8 +546,8 @@ finalize_aggregate(AggStatePerAgg peraggstate,
         }
         else
         {
-               *resultVal = peraggstate->transValue;
-               *resultIsNull = peraggstate->transValueIsNull;
+               *resultVal = pergroupstate->transValue;
+               *resultIsNull = pergroupstate->transValueIsNull;
         }
  
         /*
@@ -435,59 +559,130 @@ finalize_aggregate(AggStatePerAgg peraggstate,
                 *resultVal = datumCopy(*resultVal,
                                                            peraggstate->resulttypeByVal,
                                                            peraggstate->resulttypeLen);
+
+       MemoryContextSwitchTo(oldContext);
  }
  
+/*
+ * Initialize the hash table to empty.
+ *
+ * The hash table always lives in the aggcontext memory context.
+ */
+static void
+build_hash_table(AggState *aggstate)
+{
+       Agg                *node = (Agg *) aggstate->ss.ps.plan;
+       MemoryContext tmpmem = aggstate->tmpcontext->ecxt_per_tuple_memory;
+       Size            entrysize;
+
+       Assert(node->aggstrategy == AGG_HASHED);
+       Assert(node->numGroups > 0);
+
+       entrysize = sizeof(AggHashEntryData) +
+               (aggstate->numaggs - 1) *sizeof(AggStatePerGroupData);
+
+       aggstate->hashtable = BuildTupleHashTable(node->numCols,
+                                                                                         node->grpColIdx,
+                                                                                         aggstate->eqfunctions,
+                                                                                         aggstate->hashfunctions,
+                                                                                         node->numGroups,
+                                                                                         entrysize,
+                                                                                         aggstate->aggcontext,
+                                                                                         tmpmem);
+}
  
-/* ---------------------------------------
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple.
   *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static AggHashEntry
+lookup_hash_entry(AggState *aggstate, TupleTableSlot *slot)
+{
+       AggHashEntry entry;
+       bool            isnew;
+
+       entry = (AggHashEntry) LookupTupleHashEntry(aggstate->hashtable,
+                                                                                               slot,
+                                                                                               &isnew);
+
+       if (isnew)
+       {
+               /* initialize aggregates for new tuple group */
+               initialize_aggregates(aggstate, aggstate->peragg, entry->pergroup);
+       }
+
+       return entry;
+}
+
+/*
   * ExecAgg -
   *
   *       ExecAgg receives tuples from its outer subplan and aggregates over
   *       the appropriate attribute for each aggregate function use (Aggref
   *       node) appearing in the targetlist or qual of the node.  The number
- *       of tuples to aggregate over depends on whether a GROUP BY clause is
- *       present.      We can produce an aggregate result row per group, or just
- *       one for the whole query.      The value of each aggregate is stored in
- *       the expression context to be used when ExecProject evaluates the
- *       result tuple.
- *
- *       If the outer subplan is a Group node, ExecAgg returns as many tuples
- *       as there are groups.
- *
- * ------------------------------------------
+ *       of tuples to aggregate over depends on whether grouped or plain
+ *       aggregation is selected.      In grouped aggregation, we produce a result
+ *       row for each group; in plain aggregation there's a single result row
+ *       for the whole query.  In either case, the value of each aggregate is
+ *       stored in the expression context to be used when ExecProject evaluates
+ *       the result tuple.
   */
  TupleTableSlot *
-ExecAgg(Agg *node)
+ExecAgg(AggState *node)
  {
-       AggState   *aggstate;
-       EState     *estate;
-       Plan       *outerPlan;
+       if (node->agg_done)
+               return NULL;
+
+       if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
+       {
+               if (!node->table_filled)
+                       agg_fill_hash_table(node);
+               return agg_retrieve_hash_table(node);
+       }
+       else
+               return agg_retrieve_direct(node);
+}
+
+/*
+ * ExecAgg for non-hashed case
+ */
+static TupleTableSlot *
+agg_retrieve_direct(AggState *aggstate)
+{
+       Agg                *node = (Agg *) aggstate->ss.ps.plan;
+       PlanState  *outerPlan;
         ExprContext *econtext;
+       ExprContext *tmpcontext;
         ProjectionInfo *projInfo;
         Datum      *aggvalues;
         bool       *aggnulls;
         AggStatePerAgg peragg;
-       MemoryContext oldContext;
+       AggStatePerGroup pergroup;
+       TupleTableSlot *outerslot;
+       TupleTableSlot *firstSlot;
         TupleTableSlot *resultSlot;
-       HeapTuple       inputTuple;
         int                     aggno;
-       bool            isNull;
  
         /*
          * get state info from node
          */
-       aggstate = node->aggstate;
-       estate = node->plan.state;
-       outerPlan = outerPlan(node);
-       econtext = aggstate->csstate.cstate.cs_ExprContext;
+       outerPlan = outerPlanState(aggstate);
+       /* econtext is the per-output-tuple expression context */
+       econtext = aggstate->ss.ps.ps_ExprContext;
         aggvalues = econtext->ecxt_aggvalues;
         aggnulls = econtext->ecxt_aggnulls;
-       projInfo = aggstate->csstate.cstate.cs_ProjInfo;
+       /* tmpcontext is the per-input-tuple expression context */
+       tmpcontext = aggstate->tmpcontext;
+       projInfo = aggstate->ss.ps.ps_ProjInfo;
         peragg = aggstate->peragg;
+       pergroup = aggstate->pergroup;
+       firstSlot = aggstate->ss.ss_ScanTupleSlot;
  
         /*
          * We loop retrieving groups until we find one matching
-        * node->plan.qual
+        * aggstate->ss.ps.qual
          */
         do
         {
@@ -495,208 +690,308 @@ ExecAgg(Agg *node)
                         return NULL;
  
                 /*
-                * Clear the per-output-tuple context for each group
+                * If we don't already have the first tuple of the new group,
+                * fetch it from the outer plan.
                  */
-               MemoryContextReset(aggstate->tup_cxt);
-
-               /*
-                * Initialize working state for a new input tuple group
-                */
-               for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+               if (aggstate->grp_firstTuple == NULL)
                 {
-                       AggStatePerAgg peraggstate = &peragg[aggno];
-
-                       initialize_aggregate(peraggstate);
+                       outerslot = ExecProcNode(outerPlan);
+                       if (!TupIsNull(outerslot))
+                       {
+                               /*
+                                * Make a copy of the first input tuple; we will use this
+                                * for comparisons (in group mode) and for projection.
+                                */
+                               aggstate->grp_firstTuple = heap_copytuple(outerslot->val);
+                       }
+                       else
+                       {
+                               /* outer plan produced no tuples at all */
+                               aggstate->agg_done = true;
+                               /* If we are grouping, we should produce no tuples too */
+                               if (node->aggstrategy != AGG_PLAIN)
+                                       return NULL;
+                       }
                 }
  
-               inputTuple = NULL;              /* no saved input tuple yet */
+               /*
+                * Clear the per-output-tuple context for each group
+                */
+               ResetExprContext(econtext);
  
                 /*
-                * for each tuple from the outer plan, update all the aggregates
+                * Initialize working state for a new input tuple group
                  */
-               for (;;)
+               initialize_aggregates(aggstate, peragg, pergroup);
+
+               if (aggstate->grp_firstTuple != NULL)
                 {
-                       TupleTableSlot *outerslot;
+                       /*
+                        * Store the copied first input tuple in the tuple table slot
+                        * reserved for it.  The tuple will be deleted when it is
+                        * cleared from the slot.
+                        */
+                       ExecStoreTuple(aggstate->grp_firstTuple,
+                                                  firstSlot,
+                                                  InvalidBuffer,
+                                                  true);
+                       aggstate->grp_firstTuple = NULL;        /* don't keep two pointers */
  
-                       outerslot = ExecProcNode(outerPlan, (Plan *) node);
-                       if (TupIsNull(outerslot))
-                               break;
-                       econtext->ecxt_scantuple = outerslot;
+                       /* set up for first advance_aggregates call */
+                       tmpcontext->ecxt_scantuple = firstSlot;
  
                         /*
-                        * Clear and select the current working context for evaluation
-                        * of the input expressions and transition functions at this
-                        * input tuple.
+                        * Process each outer-plan tuple, and then fetch the next one,
+                        * until we exhaust the outer plan or cross a group boundary.
                          */
-                       econtext->ecxt_per_tuple_memory =
-                               aggstate->agg_cxt[aggstate->which_cxt];
-                       ResetExprContext(econtext);
-                       oldContext =
-                               MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
-
-                       for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+                       for (;;)
                         {
-                               AggStatePerAgg peraggstate = &peragg[aggno];
-                               Aggref     *aggref = peraggstate->aggref;
-                               Datum           newVal;
+                               advance_aggregates(aggstate, pergroup);
  
-                               newVal = ExecEvalExpr(aggref->target, econtext,
-                                                                         &isNull, NULL);
+                               /* Reset per-input-tuple context after each tuple */
+                               ResetExprContext(tmpcontext);
  
-                               if (aggref->aggdistinct)
+                               outerslot = ExecProcNode(outerPlan);
+                               if (TupIsNull(outerslot))
                                 {
-                                       /* in DISTINCT mode, we may ignore nulls */
-                                       if (isNull)
-                                               continue;
-                                       /* putdatum has to be called in per-query context */
-                                       MemoryContextSwitchTo(oldContext);
-                                       tuplesort_putdatum(peraggstate->sortstate,
-                                                                          newVal, isNull);
-                                       MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+                                       /* no more outer-plan tuples available */
+                                       aggstate->agg_done = true;
+                                       break;
                                 }
-                               else
+                               /* set up for next advance_aggregates call */
+                               tmpcontext->ecxt_scantuple = outerslot;
+
+                               /*
+                                * If we are grouping, check whether we've crossed a group
+                                * boundary.
+                                */
+                               if (node->aggstrategy == AGG_SORTED)
                                 {
-                                       advance_transition_function(peraggstate,
-                                                                                               newVal, isNull);
+                                       if (!execTuplesMatch(firstSlot->val,
+                                                                                outerslot->val,
+                                                                                firstSlot->ttc_tupleDescriptor,
+                                                                                node->numCols, node->grpColIdx,
+                                                                                aggstate->eqfunctions,
+                                                                         tmpcontext->ecxt_per_tuple_memory))
+                                       {
+                                               /*
+                                                * Save the first input tuple of the next group.
+                                                */
+                                               aggstate->grp_firstTuple = heap_copytuple(outerslot->val);
+                                               break;
+                                       }
                                 }
                         }
-
-                       /*
-                        * Make the other context current so that these transition
-                        * results are preserved.
-                        */
-                       aggstate->which_cxt = 1 - aggstate->which_cxt;
-
-                       MemoryContextSwitchTo(oldContext);
-
-                       /*
-                        * Keep a copy of the first input tuple for the projection.
-                        * (We only need one since only the GROUP BY columns in it can
-                        * be referenced, and these will be the same for all tuples
-                        * aggregated over.)
-                        */
-                       if (!inputTuple)
-                               inputTuple = heap_copytuple(outerslot->val);
                 }
  
                 /*
                  * Done scanning input tuple group. Finalize each aggregate
                  * calculation, and stash results in the per-output-tuple context.
-                *
-                * This is a bit tricky when there are both DISTINCT and plain
-                * aggregates: we must first finalize all the plain aggs and then
-                * all the DISTINCT ones.  This is needed because the last
-                * transition values for the plain aggs are stored in the
-                * not-current working context, and we have to evaluate those aggs
-                * (and stash the results in the output tup_cxt!) before we start
-                * flipping contexts again in process_sorted_aggregate.
                  */
-               oldContext = MemoryContextSwitchTo(aggstate->tup_cxt);
                 for (aggno = 0; aggno < aggstate->numaggs; aggno++)
                 {
                         AggStatePerAgg peraggstate = &peragg[aggno];
+                       AggStatePerGroup pergroupstate = &pergroup[aggno];
+
+                       if (peraggstate->aggref->aggdistinct)
+                               process_sorted_aggregate(aggstate, peraggstate, pergroupstate);
  
-                       if (!peraggstate->aggref->aggdistinct)
-                               finalize_aggregate(peraggstate,
-                                                                  &aggvalues[aggno], &aggnulls[aggno]);
+                       finalize_aggregate(aggstate, peraggstate, pergroupstate,
+                                                          &aggvalues[aggno], &aggnulls[aggno]);
                 }
-               MemoryContextSwitchTo(oldContext);
-               for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+
+               /*
+                * If we have no first tuple (ie, the outerPlan didn't return
+                * anything), create a dummy all-nulls input tuple for use by
+                * ExecProject. 99.44% of the time this is a waste of cycles,
+                * because ordinarily the projected output tuple's targetlist
+                * cannot contain any direct (non-aggregated) references to input
+                * columns, so the dummy tuple will not be referenced. However
+                * there are special cases where this isn't so --- in particular
+                * an UPDATE involving an aggregate will have a targetlist
+                * reference to ctid.  We need to return a null for ctid in that
+                * situation, not coredump.
+                *
+                * The values returned for the aggregates will be the initial values
+                * of the transition functions.
+                */
+               if (TupIsNull(firstSlot))
                 {
-                       AggStatePerAgg peraggstate = &peragg[aggno];
+                       TupleDesc       tupType;
  
-                       if (peraggstate->aggref->aggdistinct)
+                       /* Should only happen in non-grouped mode */
+                       Assert(node->aggstrategy == AGG_PLAIN);
+                       Assert(aggstate->agg_done);
+
+                       tupType = firstSlot->ttc_tupleDescriptor;
+                       /* watch out for zero-column input tuples, though... */
+                       if (tupType && tupType->natts > 0)
                         {
-                               process_sorted_aggregate(aggstate, peraggstate);
-                               oldContext = MemoryContextSwitchTo(aggstate->tup_cxt);
-                               finalize_aggregate(peraggstate,
-                                                                  &aggvalues[aggno], &aggnulls[aggno]);
-                               MemoryContextSwitchTo(oldContext);
+                               HeapTuple       nullsTuple;
+                               Datum      *dvalues;
+                               char       *dnulls;
+
+                               dvalues = (Datum *) palloc0(sizeof(Datum) * tupType->natts);
+                               dnulls = (char *) palloc(sizeof(char) * tupType->natts);
+                               MemSet(dnulls, 'n', sizeof(char) * tupType->natts);
+                               nullsTuple = heap_formtuple(tupType, dvalues, dnulls);
+                               ExecStoreTuple(nullsTuple,
+                                                          firstSlot,
+                                                          InvalidBuffer,
+                                                          true);
+                               pfree(dvalues);
+                               pfree(dnulls);
                         }
                 }
  
                 /*
-                * If the outerPlan is a Group node, we will reach here after each
-                * group.  We are not done unless the Group node is done (a little
-                * ugliness here while we reach into the Group's state to find
-                * out). Furthermore, when grouping we return nothing at all
-                * unless we had some input tuple(s).  By the nature of Group,
-                * there are no empty groups, so if we get here with no input the
-                * whole scan is empty.
-                *
-                * If the outerPlan isn't a Group, we are done when we get here, and
-                * we will emit a (single) tuple even if there were no input
-                * tuples.
+                * Form a projection tuple using the aggregate results and the
+                * representative input tuple.  Store it in the result tuple slot.
+                * Note we do not support aggregates returning sets ...
                  */
-               if (IsA(outerPlan, Group))
+               econtext->ecxt_scantuple = firstSlot;
+               resultSlot = ExecProject(projInfo, NULL);
+
+               /*
+                * If the completed tuple does not match the qualifications, it is
+                * ignored and we loop back to try to process another group.
+                * Otherwise, return the tuple.
+                */
+       }
+       while (!ExecQual(aggstate->ss.ps.qual, econtext, false));
+
+       return resultSlot;
+}
+
+/*
+ * ExecAgg for hashed case: phase 1, read input and build hash table
+ *
+ * Drains the outer plan completely.  For every input tuple we look up
+ * (or create) the hash table entry for the tuple's group via
+ * lookup_hash_entry(), and advance the aggregates using that entry's
+ * per-group state (entry->pergroup).
+ *
+ * On return, aggstate->table_filled is true and aggstate->hashiter has
+ * been reset against aggstate->hashtable.  ExecAgg calls this routine
+ * only while table_filled is false, and then hands off to
+ * agg_retrieve_hash_table().
+ *
+ * No result is returned; all effects are on *aggstate.
+ */
+static void
+agg_fill_hash_table(AggState *aggstate)
+{
+       PlanState  *outerPlan;
+       ExprContext *tmpcontext;
+       AggHashEntry entry;
+       TupleTableSlot *outerslot;
+
+       /*
+        * get state info from node
+        */
+       outerPlan = outerPlanState(aggstate);
+       /*
+        * tmpcontext is the per-input-tuple expression context; it is reset
+        * at the bottom of every iteration of the loop below.
+        */
+       tmpcontext = aggstate->tmpcontext;
+
+       /*
+        * Process each outer-plan tuple, and then fetch the next one, until
+        * we exhaust the outer plan.
+        */
+       for (;;)
+       {
+               outerslot = ExecProcNode(outerPlan);
+               if (TupIsNull(outerslot))
+                       break;          /* outer plan is exhausted */
+               /*
+                * set up for advance_aggregates call: point the per-input-tuple
+                * context's scan tuple at the tuple just fetched
+                */
+               tmpcontext->ecxt_scantuple = outerslot;
+
+               /* Find or build hashtable entry for this tuple's group */
+               entry = lookup_hash_entry(aggstate, outerslot);
+
+               /* Advance the aggregates, using this group's per-group state */
+               advance_aggregates(aggstate, entry->pergroup);
+
+               /* Reset per-input-tuple context after each tuple */
+               ResetExprContext(tmpcontext);
+       }
+
+       aggstate->table_filled = true;  /* ExecAgg tests this before refilling */
+       /*
+        * Initialize to walk the hash table: reset the iterator kept in
+        * aggstate->hashiter.
+        */
+       ResetTupleHashIterator(aggstate->hashtable, &aggstate->hashiter);
+}
+
+/*
+ * ExecAgg for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+agg_retrieve_hash_table(AggState *aggstate)
+{
+       ExprContext *econtext;
+       ProjectionInfo *projInfo;
+       Datum      *aggvalues;
+       bool       *aggnulls;
+       AggStatePerAgg peragg;
+       AggStatePerGroup pergroup;
+       AggHashEntry entry;
+       TupleTableSlot *firstSlot;
+       TupleTableSlot *resultSlot;
+       int                     aggno;
+
+       /*
+        * get state info from node
+        */
+       /* econtext is the per-output-tuple expression context */
+       econtext = aggstate->ss.ps.ps_ExprContext;
+       aggvalues = econtext->ecxt_aggvalues;
+       aggnulls = econtext->ecxt_aggnulls;
+       projInfo = aggstate->ss.ps.ps_ProjInfo;
+       peragg = aggstate->peragg;
+       firstSlot = aggstate->ss.ss_ScanTupleSlot;
+
+       /*
+        * We loop retrieving groups until we find one satisfying
+        * aggstate->ss.ps.qual
+        */
+       do
+       {
+               if (aggstate->agg_done)
+                       return NULL;
+
+               /*
+                * Find the next entry in the hash table
+                */
+               entry = (AggHashEntry) ScanTupleHashTable(&aggstate->hashiter);
+               if (entry == NULL)
                 {
-                       /* aggregation over groups */
-                       aggstate->agg_done = ((Group *) outerPlan)->grpstate->grp_done;
-                       /* check for no groups */
-                       if (inputTuple == NULL)
-                               return NULL;
+                       /* No more entries in hashtable, so done */
+                       aggstate->agg_done = TRUE;
+                       return NULL;
                 }
-               else
-               {
-                       aggstate->agg_done = true;
  
-                       /*
-                        * If inputtuple==NULL (ie, the outerPlan didn't return
-                        * anything), create a dummy all-nulls input tuple for use by
-                        * ExecProject. 99.44% of the time this is a waste of cycles,
-                        * because ordinarily the projected output tuple's targetlist
-                        * cannot contain any direct (non-aggregated) references to
-                        * input columns, so the dummy tuple will not be referenced.
-                        * However there are special cases where this isn't so --- in
-                        * particular an UPDATE involving an aggregate will have a
-                        * targetlist reference to ctid.  We need to return a null for
-                        * ctid in that situation, not coredump.
-                        *
-                        * The values returned for the aggregates will be the initial
-                        * values of the transition functions.
-                        */
-                       if (inputTuple == NULL)
-                       {
-                               TupleDesc       tupType;
-                               Datum      *tupValue;
-                               char       *null_array;
-                               AttrNumber      attnum;
-
-                               tupType = aggstate->csstate.css_ScanTupleSlot->ttc_tupleDescriptor;
-                               tupValue = projInfo->pi_tupValue;
-                               /* watch out for null input tuples, though... */
-                               if (tupType && tupValue)
-                               {
-                                       null_array = (char *) palloc(sizeof(char) * tupType->natts);
-                                       for (attnum = 0; attnum < tupType->natts; attnum++)
-                                               null_array[attnum] = 'n';
-                                       inputTuple = heap_formtuple(tupType, tupValue, null_array);
-                                       pfree(null_array);
-                               }
-                       }
-               }
+               /*
+                * Clear the per-output-tuple context for each group
+                */
+               ResetExprContext(econtext);
  
                 /*
-                * Store the representative input tuple in the tuple table slot
-                * reserved for it.  The tuple will be deleted when it is cleared
-                * from the slot.
+                * Store the copied first input tuple in the tuple table slot
+                * reserved for it, so that it can be used in ExecProject.
                  */
-               ExecStoreTuple(inputTuple,
-                                          aggstate->csstate.css_ScanTupleSlot,
+               ExecStoreTuple(entry->shared.firstTuple,
+                                          firstSlot,
                                            InvalidBuffer,
-                                          true);
-               econtext->ecxt_scantuple = aggstate->csstate.css_ScanTupleSlot;
+                                          false);
+
+               pergroup = entry->pergroup;
  
                 /*
-                * Do projection and qual check in the per-output-tuple context.
+                * Finalize each aggregate calculation, and stash results in the
+                * per-output-tuple context.
                  */
-               econtext->ecxt_per_tuple_memory = aggstate->tup_cxt;
+               for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+               {
+                       AggStatePerAgg peraggstate = &peragg[aggno];
+                       AggStatePerGroup pergroupstate = &pergroup[aggno];
+
+                       Assert(!peraggstate->aggref->aggdistinct);
+                       finalize_aggregate(aggstate, peraggstate, pergroupstate,
+                                                          &aggvalues[aggno], &aggnulls[aggno]);
+               }
  
                 /*
                  * Form a projection tuple using the aggregate results and the
                  * representative input tuple.  Store it in the result tuple slot.
                  * Note we do not support aggregates returning sets ...
                  */
+               econtext->ecxt_scantuple = firstSlot;
                 resultSlot = ExecProject(projInfo, NULL);
  
                 /*
@@ -705,7 +1000,7 @@ ExecAgg(Agg *node)
                  * Otherwise, return the tuple.
                  */
         }
-       while (!ExecQual(node->plan.qual, econtext, false));
+       while (!ExecQual(aggstate->ss.ps.qual, econtext, false));
  
         return resultSlot;
  }
@@ -717,8 +1012,8 @@ ExecAgg(Agg *node)
   *     planner and initializes its outer subtree
   * -----------------
   */
-bool
-ExecInitAgg(Agg *node, EState *estate, Plan *parent)
+AggState *
+ExecInitAgg(Agg *node, EState *estate)
  {
         AggState   *aggstate;
         AggStatePerAgg peragg;
@@ -728,138 +1023,214 @@ ExecInitAgg(Agg *node, EState *estate, Plan *parent)
                                 aggno;
         List       *alist;
  
-       /*
-        * assign the node's execution state
-        */
-       node->plan.state = estate;
-
         /*
          * create state structure
          */
         aggstate = makeNode(AggState);
-       node->aggstate = aggstate;
+       aggstate->ss.ps.plan = (Plan *) node;
+       aggstate->ss.ps.state = estate;
+
+       aggstate->aggs = NIL;
+       aggstate->numaggs = 0;
+       aggstate->eqfunctions = NULL;
+       aggstate->hashfunctions = NULL;
+       aggstate->peragg = NULL;
         aggstate->agg_done = false;
+       aggstate->pergroup = NULL;
+       aggstate->grp_firstTuple = NULL;
+       aggstate->hashtable = NULL;
  
         /*
-        * find aggregates in targetlist and quals
-        *
-        * Note: pull_agg_clauses also checks that no aggs contain other agg
-        * calls in their arguments.  This would make no sense under SQL
-        * semantics anyway (and it's forbidden by the spec).  Because that is
-        * true, we don't need to worry about evaluating the aggs in any
-        * particular order.
+        * Create expression contexts.  We need two, one for per-input-tuple
+        * processing and one for per-output-tuple processing.  We cheat a
+        * little by using ExecAssignExprContext() to build both.
          */
-       aggstate->aggs = nconc(pull_agg_clause((Node *) node->plan.targetlist),
-                                                  pull_agg_clause((Node *) node->plan.qual));
-       aggstate->numaggs = numaggs = length(aggstate->aggs);
-       if (numaggs <= 0)
-       {
-               /*
-                * This used to be treated as an error, but we can't do that
-                * anymore because constant-expression simplification could
-                * optimize away all of the Aggrefs in the targetlist and qual.
-                * So, just make a debug note, and force numaggs positive so that
-                * palloc()s below don't choke.
-                */
-               elog(LOG, "ExecInitAgg: could not find any aggregate functions");
-               numaggs = 1;
-       }
-
-       /*
-        * Create expression context
-        */
-       ExecAssignExprContext(estate, &aggstate->csstate.cstate);
+       ExecAssignExprContext(estate, &aggstate->ss.ps);
+       aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext;
+       ExecAssignExprContext(estate, &aggstate->ss.ps);
  
         /*
-        * We actually need three separate expression memory contexts: one for
-        * calculating per-output-tuple values (ie, the finished aggregate
-        * results), and two that we ping-pong between for per-input-tuple
-        * evaluation of input expressions and transition functions.  The
-        * context made by ExecAssignExprContext() is used as the output
-        * context.
+        * We also need a long-lived memory context for holding hashtable data
+        * structures and transition values.  NOTE: the details of what is
+        * stored in aggcontext and what is stored in the regular per-query
+        * memory context are driven by a simple decision: we want to reset
+        * the aggcontext in ExecReScanAgg to recover no-longer-wanted space.
          */
-       aggstate->tup_cxt =
-               aggstate->csstate.cstate.cs_ExprContext->ecxt_per_tuple_memory;
-       aggstate->agg_cxt[0] =
-               AllocSetContextCreate(CurrentMemoryContext,
-                                                         "AggExprContext1",
-                                                         ALLOCSET_DEFAULT_MINSIZE,
-                                                         ALLOCSET_DEFAULT_INITSIZE,
-                                                         ALLOCSET_DEFAULT_MAXSIZE);
-       aggstate->agg_cxt[1] =
+       aggstate->aggcontext =
                 AllocSetContextCreate(CurrentMemoryContext,
-                                                         "AggExprContext2",
+                                                         "AggContext",
                                                           ALLOCSET_DEFAULT_MINSIZE,
                                                           ALLOCSET_DEFAULT_INITSIZE,
                                                           ALLOCSET_DEFAULT_MAXSIZE);
-       aggstate->which_cxt = 0;
  
  #define AGG_NSLOTS 2
  
         /*
          * tuple table initialization
          */
-       ExecInitScanTupleSlot(estate, &aggstate->csstate);
-       ExecInitResultTupleSlot(estate, &aggstate->csstate.cstate);
+       ExecInitScanTupleSlot(estate, &aggstate->ss);
+       ExecInitResultTupleSlot(estate, &aggstate->ss.ps);
  
         /*
-        * Set up aggregate-result storage in the expr context, and also
-        * allocate my private per-agg working storage
+        * initialize child expressions
+        *
+        * Note: ExecInitExpr finds Aggrefs for us, and also checks that no aggs
+        * contain other agg calls in their arguments.  This would make no
+        * sense under SQL semantics anyway (and it's forbidden by the spec).
+        * Because that is true, we don't need to worry about evaluating the
+        * aggs in any particular order.
          */
-       econtext = aggstate->csstate.cstate.cs_ExprContext;
-       econtext->ecxt_aggvalues = (Datum *) palloc(sizeof(Datum) * numaggs);
-       MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * numaggs);
-       econtext->ecxt_aggnulls = (bool *) palloc(sizeof(bool) * numaggs);
-       MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * numaggs);
-
-       peragg = (AggStatePerAgg) palloc(sizeof(AggStatePerAggData) * numaggs);
-       MemSet(peragg, 0, sizeof(AggStatePerAggData) * numaggs);
-       aggstate->peragg = peragg;
+       aggstate->ss.ps.targetlist = (List *)
+               ExecInitExpr((Expr *) node->plan.targetlist,
+                                        (PlanState *) aggstate);
+       aggstate->ss.ps.qual = (List *)
+               ExecInitExpr((Expr *) node->plan.qual,
+                                        (PlanState *) aggstate);
  
         /*
          * initialize child nodes
          */
         outerPlan = outerPlan(node);
-       ExecInitNode(outerPlan, estate, (Plan *) node);
+       outerPlanState(aggstate) = ExecInitNode(outerPlan, estate);
  
         /*
          * initialize source tuple type.
          */
-       ExecAssignScanTypeFromOuterPlan((Plan *) node, &aggstate->csstate);
+       ExecAssignScanTypeFromOuterPlan(&aggstate->ss);
  
         /*
          * Initialize result tuple type and projection info.
          */
-       ExecAssignResultTypeFromTL((Plan *) node, &aggstate->csstate.cstate);
-       ExecAssignProjectionInfo((Plan *) node, &aggstate->csstate.cstate);
+       ExecAssignResultTypeFromTL(&aggstate->ss.ps);
+       ExecAssignProjectionInfo(&aggstate->ss.ps);
+
+       /*
+        * get the count of aggregates in targetlist and quals
+        */
+       numaggs = aggstate->numaggs;
+       Assert(numaggs == length(aggstate->aggs));
+       if (numaggs <= 0)
+       {
+               /*
+                * This is not an error condition: we might be using the Agg node
+                * just to do hash-based grouping.      Even in the regular case,
+                * constant-expression simplification could optimize away all of
+                * the Aggrefs in the targetlist and qual.      So keep going, but
+                * force local copy of numaggs positive so that palloc()s below
+                * don't choke.
+                */
+               numaggs = 1;
+       }
+
+       /*
+        * If we are grouping, precompute fmgr lookup data for inner loop. We
+        * need both equality and hashing functions to do it by hashing, but
+        * only equality if not hashing.
+        */
+       if (node->numCols > 0)
+       {
+               if (node->aggstrategy == AGG_HASHED)
+                       execTuplesHashPrepare(ExecGetScanType(&aggstate->ss),
+                                                                 node->numCols,
+                                                                 node->grpColIdx,
+                                                                 &aggstate->eqfunctions,
+                                                                 &aggstate->hashfunctions);
+               else
+                       aggstate->eqfunctions =
+                               execTuplesMatchPrepare(ExecGetScanType(&aggstate->ss),
+                                                                          node->numCols,
+                                                                          node->grpColIdx);
+       }
+
+       /*
+        * Set up aggregate-result storage in the output expr context, and
+        * also allocate my private per-agg working storage
+        */
+       econtext = aggstate->ss.ps.ps_ExprContext;
+       econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
+       econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs);
+
+       peragg = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs);
+       aggstate->peragg = peragg;
+
+       if (node->aggstrategy == AGG_HASHED)
+       {
+               build_hash_table(aggstate);
+               aggstate->table_filled = false;
+       }
+       else
+       {
+               AggStatePerGroup pergroup;
+
+               pergroup = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData) * numaggs);
+               aggstate->pergroup = pergroup;
+       }
  
         /*
          * Perform lookups of aggregate function info, and initialize the
-        * unchanging fields of the per-agg data
+        * unchanging fields of the per-agg data.  We also detect duplicate
+        * aggregates (for example, "SELECT sum(x) ... HAVING sum(x) > 0").
+        * When duplicates are detected, we only make an AggStatePerAgg struct
+        * for the first one.  The clones are simply pointed at the same
+        * result entry by giving them duplicate aggno values.
          */
         aggno = -1;
         foreach(alist, aggstate->aggs)
         {
-               Aggref     *aggref = (Aggref *) lfirst(alist);
-               AggStatePerAgg peraggstate = &peragg[++aggno];
+               AggrefExprState *aggrefstate = (AggrefExprState *) lfirst(alist);
+               Aggref     *aggref = (Aggref *) aggrefstate->xprstate.expr;
+               AggStatePerAgg peraggstate;
+               Oid                     inputType;
                 HeapTuple       aggTuple;
                 Form_pg_aggregate aggform;
+               Oid                     aggtranstype;
                 AclResult       aclresult;
                 Oid                     transfn_oid,
                                         finalfn_oid;
+               Expr       *transfnexpr,
+                                  *finalfnexpr;
                 Datum           textInitVal;
+               int                     i;
+
+               /* Planner should have assigned aggregate to correct level */
+               Assert(aggref->agglevelsup == 0);
+
+               /* Look for a previous duplicate aggregate */
+               for (i = 0; i <= aggno; i++)
+               {
+                       if (equal(aggref, peragg[i].aggref) &&
+                               !contain_volatile_functions((Node *) aggref))
+                               break;
+               }
+               if (i <= aggno)
+               {
+                       /* Found a match to an existing entry, so just mark it */
+                       aggrefstate->aggno = i;
+                       continue;
+               }
+
+               /* Nope, so assign a new PerAgg record */
+               peraggstate = &peragg[++aggno];
  
-               /* Mark Aggref node with its associated index in the result array */
-               aggref->aggno = aggno;
+               /* Mark Aggref state node with assigned index in the result array */
+               aggrefstate->aggno = aggno;
  
                 /* Fill in the peraggstate data */
+               peraggstate->aggrefstate = aggrefstate;
                 peraggstate->aggref = aggref;
  
+               /*
+                * Get actual datatype of the input.  We need this because it may
+                * be different from the agg's declared input type, when the agg
+                * accepts ANY (eg, COUNT(*)) or ANYARRAY or ANYELEMENT.
+                */
+               inputType = exprType((Node *) aggref->target);
+
                 aggTuple = SearchSysCache(AGGFNOID,
                                                                   ObjectIdGetDatum(aggref->aggfnoid),
                                                                   0, 0, 0);
                 if (!HeapTupleIsValid(aggTuple))
-                       elog(ERROR, "ExecAgg: cache lookup failed for aggregate %u",
+                       elog(ERROR, "cache lookup failed for aggregate %u",
                                  aggref->aggfnoid);
                 aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
  
@@ -867,18 +1238,56 @@ ExecInitAgg(Agg *node, EState *estate, Plan *parent)
                 aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(),
                                                                          ACL_EXECUTE);
                 if (aclresult != ACLCHECK_OK)
-                       aclcheck_error(aclresult, get_func_name(aggref->aggfnoid));
+                       aclcheck_error(aclresult, ACL_KIND_PROC,
+                                                  get_func_name(aggref->aggfnoid));
+
+               peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn;
+               peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
+
+               /* resolve actual type of transition state, if polymorphic */
+               aggtranstype = aggform->aggtranstype;
+               if (aggtranstype == ANYARRAYOID || aggtranstype == ANYELEMENTOID)
+               {
+                       /* have to fetch the agg's declared input type... */
+                       Oid                     agg_arg_types[FUNC_MAX_ARGS];
+                       int                     agg_nargs;
+
+                       (void) get_func_signature(aggref->aggfnoid,
+                                                                         agg_arg_types, &agg_nargs);
+                       Assert(agg_nargs == 1);
+                       aggtranstype = resolve_generic_type(aggtranstype,
+                                                                                               inputType,
+                                                                                               agg_arg_types[0]);
+               }
+
+               /* build expression trees using actual argument & result types */
+               build_aggregate_fnexprs(inputType,
+                                                               aggtranstype,
+                                                               aggref->aggtype,
+                                                               transfn_oid,
+                                                               finalfn_oid,
+                                                               &transfnexpr,
+                                                               &finalfnexpr);
+
+               fmgr_info(transfn_oid, &peraggstate->transfn);
+               peraggstate->transfn.fn_expr = (Node *) transfnexpr;
+
+               if (OidIsValid(finalfn_oid))
+               {
+                       fmgr_info(finalfn_oid, &peraggstate->finalfn);
+                       peraggstate->finalfn.fn_expr = (Node *) finalfnexpr;
+               }
  
                 get_typlenbyval(aggref->aggtype,
                                                 &peraggstate->resulttypeLen,
                                                 &peraggstate->resulttypeByVal);
-               get_typlenbyval(aggform->aggtranstype,
+               get_typlenbyval(aggtranstype,
                                                 &peraggstate->transtypeLen,
                                                 &peraggstate->transtypeByVal);
  
                 /*
-                * initval is potentially null, so don't try to access it as a struct
-                * field. Must do it the hard way with SysCacheGetAttr.
+                * initval is potentially null, so don't try to access it as a
+                * struct field. Must do it the hard way with SysCacheGetAttr.
                  */
                 textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
                                                                           Anum_pg_aggregate_agginitval,
@@ -888,14 +1297,7 @@ ExecInitAgg(Agg *node, EState *estate, Plan *parent)
                         peraggstate->initValue = (Datum) 0;
                 else
                         peraggstate->initValue = GetAggInitVal(textInitVal,
-                                                                                                  aggform->aggtranstype);
-
-               peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn;
-               peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
-
-               fmgr_info(transfn_oid, &peraggstate->transfn);
-               if (OidIsValid(finalfn_oid))
-                       fmgr_info(finalfn_oid, &peraggstate->finalfn);
+                                                                                                  aggtranstype);
  
                 /*
                  * If the transfn is strict and the initval is NULL, make sure
@@ -906,48 +1308,38 @@ ExecInitAgg(Agg *node, EState *estate, Plan *parent)
                  */
                 if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull)
                 {
-                       /*
-                        * Note: use the type from the input expression here, not
-                        * from pg_proc.proargtypes, because the latter might be 0.
-                        * (Consider COUNT(*).)
-                        */
-                       Oid                     inputType = exprType(aggref->target);
-
-                       if (!IsBinaryCompatible(inputType, aggform->aggtranstype))
-                               elog(ERROR, "Aggregate %u needs to have compatible input type and transition type",
-                                        aggref->aggfnoid);
+                       if (!IsBinaryCoercible(inputType, aggtranstype))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+                                                errmsg("aggregate %u needs to have compatible input type and transition type",
+                                                               aggref->aggfnoid)));
                 }
  
                 if (aggref->aggdistinct)
                 {
-                       /*
-                        * Note: use the type from the input expression here, not
-                        * from pg_proc.proargtypes, because the latter might be 0.
-                        * (Consider COUNT(*).)
-                        */
-                       Oid                     inputType = exprType(aggref->target);
                         Oid                     eq_function;
  
+                       /* We don't implement DISTINCT aggs in the HASHED case */
+                       Assert(node->aggstrategy != AGG_HASHED);
+
                         peraggstate->inputType = inputType;
                         get_typlenbyval(inputType,
                                                         &peraggstate->inputtypeLen,
                                                         &peraggstate->inputtypeByVal);
  
-                       eq_function = compatible_oper_funcid(makeList1(makeString("=")),
-                                                                                                inputType, inputType,
-                                                                                                true);
-                       if (!OidIsValid(eq_function))
-                               elog(ERROR, "Unable to identify an equality operator for type %s",
-                                        format_type_be(inputType));
+                       eq_function = equality_oper_funcid(inputType);
                         fmgr_info(eq_function, &(peraggstate->equalfn));
-                       peraggstate->sortOperator = any_ordering_op(inputType);
+                       peraggstate->sortOperator = ordering_oper_opid(inputType);
                         peraggstate->sortstate = NULL;
                 }
  
                 ReleaseSysCache(aggTuple);
         }
  
-       return TRUE;
+       /* Update numaggs to match number of unique aggregates found */
+       aggstate->numaggs = aggno + 1;
+
+       return aggstate;
  }
  
  static Datum
@@ -965,7 +1357,7 @@ GetAggInitVal(Datum textInitVal, Oid transtype)
                                                  ObjectIdGetDatum(transtype),
                                                  0, 0, 0);
         if (!HeapTupleIsValid(tup))
-               elog(ERROR, "GetAggInitVal: cache lookup failed on aggregate transition function return type %u", transtype);
+               elog(ERROR, "cache lookup failed for type %u", transtype);
  
         typinput = ((Form_pg_type) GETSTRUCT(tup))->typinput;
         typelem = ((Form_pg_type) GETSTRUCT(tup))->typelem;
@@ -989,49 +1381,104 @@ ExecCountSlotsAgg(Agg *node)
  }
  
  void
-ExecEndAgg(Agg *node)
+ExecEndAgg(AggState *node)	/* node shutdown: end open sorts, free contexts, end the outer plan */
  {
-       AggState   *aggstate = node->aggstate;
-       Plan       *outerPlan;
+       PlanState  *outerPlan;
+       int                     aggno;
  
-       ExecFreeProjectionInfo(&aggstate->csstate.cstate);
+       /* End any tuplesort still open in a per-aggregate state (non-NULL sortstate) */
+       for (aggno = 0; aggno < node->numaggs; aggno++)
+       {
+               AggStatePerAgg peraggstate = &node->peragg[aggno];
  
-       /*
-        * Make sure ExecFreeExprContext() frees the right expr context...
-        */
-       aggstate->csstate.cstate.cs_ExprContext->ecxt_per_tuple_memory =
-               aggstate->tup_cxt;
-       ExecFreeExprContext(&aggstate->csstate.cstate);
+               if (peraggstate->sortstate)
+                       tuplesort_end(peraggstate->sortstate);
+       }
  
         /*
-        * ... and I free the others.
+        * Free both the expr contexts: the node's regular one first, then tmpcontext.
          */
-       MemoryContextDelete(aggstate->agg_cxt[0]);
-       MemoryContextDelete(aggstate->agg_cxt[1]);
-
-       outerPlan = outerPlan(node);
-       ExecEndNode(outerPlan, (Plan *) node);
+       ExecFreeExprContext(&node->ss.ps);
+       node->ss.ps.ps_ExprContext = node->tmpcontext;	/* swap in tmpcontext so the same call frees it too */
+       ExecFreeExprContext(&node->ss.ps);
  
         /* clean up tuple table: clear the scan tuple slot */
-       ExecClearTuple(aggstate->csstate.css_ScanTupleSlot);
+       ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+       MemoryContextDelete(node->aggcontext);	/* per the file header notes, holds transvalues and AGG_HASHED hashtable structures */
+
+       outerPlan = outerPlanState(node);
+       ExecEndNode(outerPlan);	/* finally shut down the outer plan node */
  }
  
  void
-ExecReScanAgg(Agg *node, ExprContext *exprCtxt, Plan *parent)
+ExecReScanAgg(AggState *node, ExprContext *exprCtxt)	/* reset the node's state so the aggregation can be run again */
  {
-       AggState   *aggstate = node->aggstate;
-       ExprContext *econtext = aggstate->csstate.cstate.cs_ExprContext;
+       ExprContext *econtext = node->ss.ps.ps_ExprContext;
+       int                     aggno;
  
-       aggstate->agg_done = false;
-       MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * aggstate->numaggs);
-       MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * aggstate->numaggs);
+       node->agg_done = false;
+
+       if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
+       {
+               /*
+                * In the hashed case, if we haven't yet built the hash table then
+                * we can just return; nothing done yet, so nothing to undo. If
+                * subnode's chgParam is not NULL then it will be re-scanned by
+                * ExecProcNode, else no reason to re-scan it at all.
+                */
+               if (!node->table_filled)
+                       return;
+
+               /*
+                * If we do have the hash table and the subplan does not have any
+                * parameter changes, then we can just rescan the existing hash
+                * table; no need to build it again.
+                */
+               if (((PlanState *) node)->lefttree->chgParam == NULL)
+               {
+                       ResetTupleHashIterator(node->hashtable, &node->hashiter);	/* restart iteration over the existing table */
+                       return;
+               }
+       }
+
+       /* Make sure we have closed any open tuplesorts (those with a non-NULL sortstate) */
+       for (aggno = 0; aggno < node->numaggs; aggno++)
+       {
+               AggStatePerAgg peraggstate = &node->peragg[aggno];
+
+               if (peraggstate->sortstate)
+                       tuplesort_end(peraggstate->sortstate);
+               peraggstate->sortstate = NULL;	/* clear it so the sort is not ended a second time */
+       }
+
+       /* Release first tuple of group, if we have made a copy */
+       if (node->grp_firstTuple != NULL)
+       {
+               heap_freetuple(node->grp_firstTuple);
+               node->grp_firstTuple = NULL;
+       }
+
+       /* Forget current agg values: zero the result-value and null-flag arrays */
+       MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs);
+       MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
+
+       /* Release all temp storage held in aggcontext */
+       MemoryContextReset(node->aggcontext);
+
+       if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
+       {
+               /* Rebuild an empty hash table and mark it as not yet filled */
+               build_hash_table(node);
+               node->table_filled = false;
+       }
  
         /*
          * if chgParam of subnode is not null then plan will be re-scanned by
          * first ExecProcNode.
          */
-       if (((Plan *) node)->lefttree->chgParam == NULL)
-               ExecReScan(((Plan *) node)->lefttree, exprCtxt, (Plan *) node);
+       if (((PlanState *) node)->lefttree->chgParam == NULL)
+               ExecReScan(((PlanState *) node)->lefttree, exprCtxt);	/* otherwise rescan the subnode explicitly here */
  }
  
  /*
@@ -1047,7 +1494,7 @@ ExecReScanAgg(Agg *node, ExprContext *exprCtxt, Plan *parent)
  Datum
  aggregate_dummy(PG_FUNCTION_ARGS)	/* does no work: only reports an ERROR naming the called function's OID */
  {
-       elog(ERROR, "Aggregate function %u called as normal function",
+       elog(ERROR, "aggregate function %u called as normal function",
                  fcinfo->flinfo->fn_oid);
         return (Datum) 0;                       /* keep compiler quiet; presumably never reached after elog(ERROR) */
  }