* Routines to handle INTERSECT and EXCEPT selection
*
* The input of a SetOp node consists of tuples from two relations,
- * which have been combined into one dataset and sorted on all the nonjunk
- * attributes. In addition there is a junk attribute that shows which
- * relation each tuple came from. The SetOp node scans each group of
- * identical tuples to determine how many came from each input relation.
- * Then it is a simple matter to emit the output demanded by the SQL spec
- * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ * which have been combined into one dataset, with a junk attribute added
+ * that shows which relation each tuple came from. In SETOP_SORTED mode,
+ * the input has furthermore been sorted according to all the grouping
+ * columns (ie, all the non-junk attributes). The SetOp node scans each
+ * group of identical tuples to determine how many came from each input
+ * relation. Then it is a simple matter to emit the output demanded by the
+ * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ *
+ * In SETOP_HASHED mode, the input is delivered in no particular order.
+ * We build a hash table in memory with one entry for each group of
+ * identical tuples, and count the number of tuples in the group from
+ * each relation. After seeing all the input, we scan the hashtable and
+ * generate the correct output using those counts.
*
* This node type is not used for UNION or UNION ALL, since those can be
* implemented more cheaply (there's no need for the junk attribute to
* identify the source relation).
*
+ * Note that SetOp does no qual checking nor projection. The delivered
+ * output tuples are just copies of the first-to-arrive tuple in each
+ * input group.
+ *
*
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-/*
- * INTERFACE ROUTINES
- * ExecSetOp - filter input to generate INTERSECT/EXCEPT results
- * ExecInitSetOp - initialize node and subnodes..
- * ExecEndSetOp - shutdown node and subnodes
- */
#include "postgres.h"
#include "utils/memutils.h"
+/*
+ * SetOpStatePerGroupData - per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
+ * the plan state node. In SETOP_HASHED mode, the hash table contains one
+ * of these for each tuple group.
+ */
+typedef struct SetOpStatePerGroupData
+{
+ long numLeft; /* number of left-input dups in group */
+ long numRight; /* number of right-input dups in group */
+} SetOpStatePerGroupData;
+
+/*
+ * To implement hashed mode, we need a hashtable that stores a
+ * representative tuple and the duplicate counts for each distinct set
+ * of grouping columns. We compute the hash key from the grouping columns.
+ */
+typedef struct SetOpHashEntryData *SetOpHashEntry;
+
+typedef struct SetOpHashEntryData
+{
+ TupleHashEntryData shared; /* common header for hash table entries */
+ SetOpStatePerGroupData pergroup;
+} SetOpHashEntryData;
+
+
+static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
+static void setop_fill_hash_table(SetOpState *setopstate);
+static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
+
+
+/*
+ * Initialize state for a new group of input values.
+ */
+static void
+initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+ pergroup->numLeft = pergroup->numRight = 0;
+}
+
+/*
+ * Advance the appropriate counter for one input tuple.
+ */
+static void
+advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup,
+ TupleTableSlot *inputslot)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ int flag;
+ bool isNull;
+
+ flag = DatumGetInt32(slot_getattr(inputslot,
+ node->flagColIdx,
+ &isNull));
+ Assert(!isNull);
+ if (flag)
+ pergroup->numRight++;
+ else
+ pergroup->numLeft++;
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(SetOpState *setopstate)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+
+ Assert(node->strategy == SETOP_HASHED);
+ Assert(node->numGroups > 0);
+
+ setopstate->hashtable = BuildTupleHashTable(node->numCols,
+ node->dupColIdx,
+ setopstate->eqfunctions,
+ setopstate->hashfunctions,
+ node->numGroups,
+ sizeof(SetOpHashEntryData),
+ setopstate->tableContext,
+ setopstate->tempContext);
+}
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple.
+ */
+static SetOpHashEntry
+lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot)
+{
+ SetOpHashEntry entry;
+ bool isnew;
+
+ /* find or create the hashtable entry */
+ entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable,
+ inputslot,
+ &isnew);
+
+ if (isnew)
+ {
+ /* initialize counts for new tuple group */
+ initialize_counts(setopstate, &entry->pergroup);
+ }
+
+ /* Must reset temp context after each hashtable lookup */
+ MemoryContextReset(setopstate->tempContext);
+
+ return entry;
+}
+
+/*
+ * We've completed processing a tuple group. Decide how many copies (if any)
+ * of its representative row to emit, and store the count into numOutput.
+ * This logic is straight from the SQL92 specification.
+ */
+static void
+set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+ SetOp *plannode = (SetOp *) setopstate->ps.plan;
+
+ switch (plannode->cmd)
+ {
+ case SETOPCMD_INTERSECT:
+ if (pergroup->numLeft > 0 && pergroup->numRight > 0)
+ setopstate->numOutput = 1;
+ else
+ setopstate->numOutput = 0;
+ break;
+ case SETOPCMD_INTERSECT_ALL:
+ setopstate->numOutput =
+ (pergroup->numLeft < pergroup->numRight) ?
+ pergroup->numLeft : pergroup->numRight;
+ break;
+ case SETOPCMD_EXCEPT:
+ if (pergroup->numLeft > 0 && pergroup->numRight == 0)
+ setopstate->numOutput = 1;
+ else
+ setopstate->numOutput = 0;
+ break;
+ case SETOPCMD_EXCEPT_ALL:
+ setopstate->numOutput =
+ (pergroup->numLeft < pergroup->numRight) ?
+ 0 : (pergroup->numLeft - pergroup->numRight);
+ break;
+ default:
+ elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
+ break;
+ }
+}
+
+
/* ----------------------------------------------------------------
* ExecSetOp
* ----------------------------------------------------------------
ExecSetOp(SetOpState *node)
{
SetOp *plannode = (SetOp *) node->ps.plan;
- TupleTableSlot *resultTupleSlot;
- PlanState *outerPlan;
-
- /*
- * get information from the node
- */
- outerPlan = outerPlanState(node);
- resultTupleSlot = node->ps.ps_ResultTupleSlot;
+ TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
/*
* If the previously-returned tuple needs to be returned more than once,
return resultTupleSlot;
}
- /* Flag that we have no current tuple */
- ExecClearTuple(resultTupleSlot);
+ /* Otherwise, we're done if we are out of groups */
+ if (node->setop_done)
+ return NULL;
+
+ /* Fetch the next tuple group according to the correct strategy */
+ if (plannode->strategy == SETOP_HASHED)
+ {
+ if (!node->table_filled)
+ setop_fill_hash_table(node);
+ return setop_retrieve_hash_table(node);
+ }
+ else
+ return setop_retrieve_direct(node);
+}
+
+/*
+ * ExecSetOp for non-hashed case
+ */
+static TupleTableSlot *
+setop_retrieve_direct(SetOpState *setopstate)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ PlanState *outerPlan;
+ SetOpStatePerGroup pergroup;
+ TupleTableSlot *outerslot;
+ TupleTableSlot *resultTupleSlot;
/*
- * Absorb groups of duplicate tuples, counting them, and saving the first
- * of each group as a possible return value. At the end of each group,
- * decide whether to return anything.
- *
- * We assume that the tuples arrive in sorted order so we can detect
- * duplicates easily.
+ * get state info from node
*/
- for (;;)
- {
- TupleTableSlot *inputTupleSlot;
- bool endOfGroup;
+ outerPlan = outerPlanState(setopstate);
+ pergroup = setopstate->pergroup;
+ resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+ /*
+ * We loop retrieving groups until we find one we should return
+ */
+ while (!setopstate->setop_done)
+ {
/*
- * fetch a tuple from the outer subplan, unless we already did.
+ * If we don't already have the first tuple of the new group, fetch it
+ * from the outer plan.
*/
- if (node->ps.ps_OuterTupleSlot == NULL &&
- !node->subplan_done)
- {
- node->ps.ps_OuterTupleSlot =
- ExecProcNode(outerPlan);
- if (TupIsNull(node->ps.ps_OuterTupleSlot))
- node->subplan_done = true;
- }
- inputTupleSlot = node->ps.ps_OuterTupleSlot;
-
- if (TupIsNull(resultTupleSlot))
+ if (setopstate->grp_firstTuple == NULL)
{
- /*
- * First of group: save a copy in result slot, and reset
- * duplicate-counters for new group.
- */
- if (node->subplan_done)
- return NULL; /* no more tuples */
- ExecCopySlot(resultTupleSlot, inputTupleSlot);
- node->numLeft = 0;
- node->numRight = 0;
- endOfGroup = false;
- }
- else if (node->subplan_done)
- {
- /*
- * Reached end of input, so finish processing final group
- */
- endOfGroup = true;
- }
- else
- {
- /*
- * Else test if the new tuple and the previously saved tuple
- * match.
- */
- if (execTuplesMatch(inputTupleSlot,
- resultTupleSlot,
- plannode->numCols, plannode->dupColIdx,
- node->eqfunctions,
- node->tempContext))
- endOfGroup = false;
+ outerslot = ExecProcNode(outerPlan);
+ if (!TupIsNull(outerslot))
+ {
+ /* Make a copy of the first input tuple */
+ setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
+ }
else
- endOfGroup = true;
+ {
+ /* outer plan produced no tuples at all */
+ setopstate->setop_done = true;
+ return NULL;
+ }
}
- if (endOfGroup)
+ /*
+ * Store the copied first input tuple in the tuple table slot
+ * reserved for it. The tuple will be deleted when it is cleared
+ * from the slot.
+ */
+ ExecStoreTuple(setopstate->grp_firstTuple,
+ resultTupleSlot,
+ InvalidBuffer,
+ true);
+ setopstate->grp_firstTuple = NULL; /* don't keep two pointers */
+
+ /* Initialize working state for a new input tuple group */
+ initialize_counts(setopstate, pergroup);
+
+ /* Count the first input tuple */
+ advance_counts(setopstate, pergroup, resultTupleSlot);
+
+ /*
+ * Scan the outer plan until we exhaust it or cross a group boundary.
+ */
+ for (;;)
{
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ {
+ /* no more outer-plan tuples available */
+ setopstate->setop_done = true;
+ break;
+ }
+
/*
- * We've reached the end of the group containing resultTuple.
- * Decide how many copies (if any) to emit. This logic is
- * straight from the SQL92 specification.
+ * Check whether we've crossed a group boundary.
*/
- switch (plannode->cmd)
+ if (!execTuplesMatch(resultTupleSlot,
+ outerslot,
+ node->numCols, node->dupColIdx,
+ setopstate->eqfunctions,
+ setopstate->tempContext))
{
- case SETOPCMD_INTERSECT:
- if (node->numLeft > 0 && node->numRight > 0)
- node->numOutput = 1;
- else
- node->numOutput = 0;
- break;
- case SETOPCMD_INTERSECT_ALL:
- node->numOutput =
- (node->numLeft < node->numRight) ?
- node->numLeft : node->numRight;
- break;
- case SETOPCMD_EXCEPT:
- if (node->numLeft > 0 && node->numRight == 0)
- node->numOutput = 1;
- else
- node->numOutput = 0;
- break;
- case SETOPCMD_EXCEPT_ALL:
- node->numOutput =
- (node->numLeft < node->numRight) ?
- 0 : (node->numLeft - node->numRight);
- break;
- default:
- elog(ERROR, "unrecognized set op: %d",
- (int) plannode->cmd);
- break;
- }
- /* Fall out of for-loop if we have tuples to emit */
- if (node->numOutput > 0)
+ /*
+ * Save the first input tuple of the next group.
+ */
+ setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
break;
- /* Else flag that we have no current tuple, and loop around */
- ExecClearTuple(resultTupleSlot);
+ }
+
+ /* Still in same group, so count this tuple */
+ advance_counts(setopstate, pergroup, outerslot);
}
- else
+
+ /*
+ * Done scanning input tuple group. See if we should emit any
+ * copies of result tuple, and if so return the first copy.
+ */
+ set_output_count(setopstate, pergroup);
+
+ if (setopstate->numOutput > 0)
{
- /*
- * Current tuple is member of same group as resultTuple. Count it
- * in the appropriate counter.
- */
- int flag;
- bool isNull;
-
- flag = DatumGetInt32(slot_getattr(inputTupleSlot,
- plannode->flagColIdx,
- &isNull));
- Assert(!isNull);
- if (flag)
- node->numRight++;
- else
- node->numLeft++;
- /* Set flag to fetch a new input tuple, and loop around */
- node->ps.ps_OuterTupleSlot = NULL;
+ setopstate->numOutput--;
+ return resultTupleSlot;
}
}
+ /* No more groups */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
+}
+
+/*
+ * ExecSetOp for hashed case: phase 1, read input and build hash table
+ */
+static void
+setop_fill_hash_table(SetOpState *setopstate)
+{
+ PlanState *outerPlan;
+ SetOpHashEntry entry;
+ TupleTableSlot *outerslot;
+
+ /*
+ * get state info from node
+ */
+ outerPlan = outerPlanState(setopstate);
+
/*
- * If we fall out of loop, then we need to emit at least one copy of
- * resultTuple.
+ * Process each outer-plan tuple, and then fetch the next one, until we
+ * exhaust the outer plan.
*/
- Assert(node->numOutput > 0);
- node->numOutput--;
- return resultTupleSlot;
+ for (;;)
+ {
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ break;
+
+ /* Find or build hashtable entry for this tuple's group */
+ entry = lookup_hash_entry(setopstate, outerslot);
+
+ /* Advance the counts */
+ advance_counts(setopstate, &entry->pergroup, outerslot);
+ }
+
+ setopstate->table_filled = true;
+ /* Initialize to walk the hash table */
+ ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
+}
+
+/*
+ * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+setop_retrieve_hash_table(SetOpState *setopstate)
+{
+ SetOpHashEntry entry;
+ TupleTableSlot *resultTupleSlot;
+
+ /*
+ * get state info from node
+ */
+ resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+ /*
+ * We loop retrieving groups until we find one we should return
+ */
+ while (!setopstate->setop_done)
+ {
+ /*
+ * Find the next entry in the hash table
+ */
+ entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter);
+ if (entry == NULL)
+ {
+ /* No more entries in hashtable, so done */
+ setopstate->setop_done = true;
+ return NULL;
+ }
+
+ /*
+ * See if we should emit any copies of this tuple, and if so return
+ * the first copy.
+ */
+ set_output_count(setopstate, &entry->pergroup);
+
+ if (setopstate->numOutput > 0)
+ {
+ setopstate->numOutput--;
+ return ExecStoreMinimalTuple(entry->shared.firstTuple,
+ resultTupleSlot,
+ false);
+ }
+ }
+
+ /* No more groups */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
}
/* ----------------------------------------------------------------
setopstate->ps.plan = (Plan *) node;
setopstate->ps.state = estate;
- setopstate->ps.ps_OuterTupleSlot = NULL;
- setopstate->subplan_done = false;
+ setopstate->eqfunctions = NULL;
+ setopstate->hashfunctions = NULL;
+ setopstate->setop_done = false;
setopstate->numOutput = 0;
+ setopstate->pergroup = NULL;
+ setopstate->grp_firstTuple = NULL;
+ setopstate->hashtable = NULL;
+ setopstate->tableContext = NULL;
/*
* Miscellaneous initialization
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
+ /*
+ * If hashing, we also need a longer-lived context to store the hash
+ * table. The table can't just be kept in the per-query context because
+ * we want to be able to throw it away in ExecReScanSetOp.
+ */
+ if (node->strategy == SETOP_HASHED)
+ setopstate->tableContext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "SetOp hash table",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
#define SETOP_NSLOTS 1
/*
ExecInitResultTupleSlot(estate, &setopstate->ps);
/*
- * then initialize outer plan
+ * initialize child nodes
+ *
+ * If we are hashing then the child plan does not need
+ * to handle REWIND efficiently; see ExecReScanSetOp.
*/
+ if (node->strategy == SETOP_HASHED)
+ eflags &= ~EXEC_FLAG_REWIND;
outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
/*
setopstate->ps.ps_ProjInfo = NULL;
/*
- * Precompute fmgr lookup data for inner loop
+ * Precompute fmgr lookup data for inner loop. We need both equality and
+ * hashing functions to do it by hashing, but only equality if not
+ * hashing.
*/
- setopstate->eqfunctions =
- execTuplesMatchPrepare(node->numCols,
- node->dupOperators);
+ if (node->strategy == SETOP_HASHED)
+ execTuplesHashPrepare(node->numCols,
+ node->dupOperators,
+ &setopstate->eqfunctions,
+ &setopstate->hashfunctions);
+ else
+ setopstate->eqfunctions =
+ execTuplesMatchPrepare(node->numCols,
+ node->dupOperators);
+
+ if (node->strategy == SETOP_HASHED)
+ {
+ build_hash_table(setopstate);
+ setopstate->table_filled = false;
+ }
+ else
+ {
+ setopstate->pergroup =
+ (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
+ }
return setopstate;
}
{
/* clean up tuple table */
ExecClearTuple(node->ps.ps_ResultTupleSlot);
- node->ps.ps_OuterTupleSlot = NULL;
+ /* free subsidiary stuff including hashtable */
MemoryContextDelete(node->tempContext);
+ if (node->tableContext)
+ MemoryContextDelete(node->tableContext);
ExecEndNode(outerPlanState(node));
}
ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt)
{
ExecClearTuple(node->ps.ps_ResultTupleSlot);
- node->ps.ps_OuterTupleSlot = NULL;
- node->subplan_done = false;
+ node->setop_done = false;
node->numOutput = 0;
+ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+ {
+ /*
+ * In the hashed case, if we haven't yet built the hash table then we
+ * can just return; nothing done yet, so nothing to undo. If subnode's
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else no reason to re-scan it at all.
+ */
+ if (!node->table_filled)
+ return;
+
+ /*
+ * If we do have the hash table and the subplan does not have any
+ * parameter changes, then we can just rescan the existing hash table;
+ * no need to build it again.
+ */
+ if (((PlanState *) node)->lefttree->chgParam == NULL)
+ {
+ ResetTupleHashIterator(node->hashtable, &node->hashiter);
+ return;
+ }
+ }
+
+ /* Release first tuple of group, if we have made a copy */
+ if (node->grp_firstTuple != NULL)
+ {
+ heap_freetuple(node->grp_firstTuple);
+ node->grp_firstTuple = NULL;
+ }
+
+ /* Release any hashtable storage */
+ if (node->tableContext)
+ MemoryContextResetAndDeleteChildren(node->tableContext);
+
+ /* And rebuild empty hashtable if needed */
+ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+ {
+ build_hash_table(node);
+ node->table_filled = false;
+ }
+
/*
* if chgParam of subnode is not null then plan will be re-scanned by
* first ExecProcNode.