1 /*-------------------------------------------------------------------------
4 * Routines to handle hash join nodes
6 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.58 2003/11/25 21:00:52 tgl Exp $
13 *-------------------------------------------------------------------------
18 #include "executor/executor.h"
19 #include "executor/nodeHash.h"
20 #include "executor/nodeHashjoin.h"
21 #include "optimizer/clauses.h"
22 #include "utils/memutils.h"
25 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *node,
26 HashJoinState *hjstate);
27 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
29 TupleTableSlot *tupleSlot);
30 static int ExecHashJoinNewBatch(HashJoinState *hjstate);
33 /* ----------------------------------------------------------------
36 * This function implements the Hybrid Hashjoin algorithm.
37 * recursive partitioning remains to be added.
38 * Note: the relation we build hash table on is the inner
39 * the other one is outer.
40 * ----------------------------------------------------------------
42 TupleTableSlot * /* return: a tuple or NULL */
43 ExecHashJoin(HashJoinState *node)
53 TupleTableSlot *inntuple;
54 ExprContext *econtext;
56 HashJoinTable hashtable;
58 TupleTableSlot *outerTupleSlot;
62 * get information from HashJoin node
64 hjclauses = node->hashclauses;
65 estate = node->js.ps.state;
66 joinqual = node->js.joinqual;
67 otherqual = node->js.ps.qual;
68 hashNode = (HashState *) innerPlanState(node);
69 outerNode = outerPlanState(node);
70 dir = estate->es_direction;
73 * get information from HashJoin state
75 hashtable = node->hj_HashTable;
76 outerkeys = node->hj_OuterHashKeys;
77 econtext = node->js.ps.ps_ExprContext;
80 * Check to see if we're still projecting out tuples from a previous
81 * join tuple (because there is a function-returning-set in the
82 * projection expressions). If so, try to project another one.
84 if (node->js.ps.ps_TupFromTlist)
86 TupleTableSlot *result;
88 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
89 if (isDone == ExprMultipleResult)
91 /* Done with that source tuple... */
92 node->js.ps.ps_TupFromTlist = false;
96 * If we're doing an IN join, we want to return at most one row per
97 * outer tuple; so we can stop scanning the inner scan if we matched
98 * on the previous try.
100 if (node->js.jointype == JOIN_IN &&
101 node->hj_MatchedOuter)
102 node->hj_NeedNewOuter = true;
105 * Reset per-tuple memory context to free any expression evaluation
106 * storage allocated in the previous tuple cycle. Note this can't
107 * happen until we're done projecting out tuples from a join tuple.
109 ResetExprContext(econtext);
112 * if this is the first call, build the hash table for inner relation
114 if (!node->hj_hashdone)
117 * create the hash table
119 Assert(hashtable == NULL);
120 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
121 node->hj_HashOperators);
122 node->hj_HashTable = hashtable;
125 * execute the Hash node, to build the hash table
127 hashNode->hashtable = hashtable;
128 (void) ExecProcNode((PlanState *) hashNode);
131 * Open temp files for outer batches, if needed. Note that file
132 * buffers are palloc'd in regular executor context.
134 for (i = 0; i < hashtable->nbatch; i++)
135 hashtable->outerBatchFile[i] = BufFileCreateTemp(false);
137 node->hj_hashdone = true;
141 * Now get an outer tuple and probe into the hash table for matches
143 outerTupleSlot = node->js.ps.ps_OuterTupleSlot;
148 * If we don't have an outer tuple, get the next one
150 if (node->hj_NeedNewOuter)
152 outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
154 if (TupIsNull(outerTupleSlot))
160 node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
161 econtext->ecxt_outertuple = outerTupleSlot;
162 node->hj_NeedNewOuter = false;
163 node->hj_MatchedOuter = false;
166 * now we have an outer tuple, find the corresponding bucket
167 * for this tuple from the hash table
169 node->hj_CurBucketNo = ExecHashGetBucket(hashtable, econtext,
171 node->hj_CurTuple = NULL;
174 * Now we've got an outer tuple and the corresponding hash
175 * bucket, but this tuple may not belong to the current batch.
176 * This need only be checked in the first pass.
178 if (hashtable->curbatch == 0)
180 int batchno = ExecHashGetBatch(node->hj_CurBucketNo,
186 * Need to postpone this outer tuple to a later batch.
187 * Save it in the corresponding outer-batch file.
189 hashtable->outerBatchSize[batchno]++;
190 ExecHashJoinSaveTuple(outerTupleSlot->val,
191 hashtable->outerBatchFile[batchno]);
192 node->hj_NeedNewOuter = true;
193 continue; /* loop around for a new outer tuple */
199 * OK, scan the selected hash bucket for matches
203 curtuple = ExecScanHashBucket(node,
206 if (curtuple == NULL)
207 break; /* out of matches */
210 * we've got a match, but still need to test non-hashed quals
212 inntuple = ExecStoreTuple(curtuple,
213 node->hj_HashTupleSlot,
215 false); /* don't pfree this tuple */
216 econtext->ecxt_innertuple = inntuple;
218 /* reset temp memory each time to avoid leaks from qual expr */
219 ResetExprContext(econtext);
222 * if we pass the qual, then save state for next call and have
223 * ExecProject form the projection, store it in the tuple
224 * table, and return the slot.
226 * Only the joinquals determine MatchedOuter status, but all
227 * quals must pass to actually return the tuple.
229 if (ExecQual(joinqual, econtext, false))
231 node->hj_MatchedOuter = true;
233 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
235 TupleTableSlot *result;
237 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
239 if (isDone != ExprEndResult)
241 node->js.ps.ps_TupFromTlist =
242 (isDone == ExprMultipleResult);
248 * If we didn't return a tuple, may need to set
251 if (node->js.jointype == JOIN_IN)
253 node->hj_NeedNewOuter = true;
254 break; /* out of loop over hash bucket */
260 * Now the current outer tuple has run out of matches, so check
261 * whether to emit a dummy outer-join tuple. If not, loop around
262 * to get a new outer tuple.
264 node->hj_NeedNewOuter = true;
266 if (!node->hj_MatchedOuter &&
267 node->js.jointype == JOIN_LEFT)
270 * We are doing an outer join and there were no join matches
271 * for this outer tuple. Generate a fake join tuple with
272 * nulls for the inner tuple, and return it if it passes the
275 econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
277 if (ExecQual(otherqual, econtext, false))
280 * qualification was satisfied so we project and return
281 * the slot containing the result tuple using
284 TupleTableSlot *result;
286 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
288 if (isDone != ExprEndResult)
290 node->js.ps.ps_TupFromTlist =
291 (isDone == ExprMultipleResult);
299 /* ----------------------------------------------------------------
302 * Init routine for HashJoin node.
303 * ----------------------------------------------------------------
306 ExecInitHashJoin(HashJoin *node, EState *estate)
308 HashJoinState *hjstate;
317 * create state structure
319 hjstate = makeNode(HashJoinState);
320 hjstate->js.ps.plan = (Plan *) node;
321 hjstate->js.ps.state = estate;
324 * Miscellaneous initialization
326 * create expression context for node
328 ExecAssignExprContext(estate, &hjstate->js.ps);
331 * initialize child expressions
333 hjstate->js.ps.targetlist = (List *)
334 ExecInitExpr((Expr *) node->join.plan.targetlist,
335 (PlanState *) hjstate);
336 hjstate->js.ps.qual = (List *)
337 ExecInitExpr((Expr *) node->join.plan.qual,
338 (PlanState *) hjstate);
339 hjstate->js.jointype = node->join.jointype;
340 hjstate->js.joinqual = (List *)
341 ExecInitExpr((Expr *) node->join.joinqual,
342 (PlanState *) hjstate);
343 hjstate->hashclauses = (List *)
344 ExecInitExpr((Expr *) node->hashclauses,
345 (PlanState *) hjstate);
348 * initialize child nodes
350 outerNode = outerPlan(node);
351 hashNode = (Hash *) innerPlan(node);
353 outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
354 innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
356 #define HASHJOIN_NSLOTS 3
359 * tuple table initialization
361 ExecInitResultTupleSlot(estate, &hjstate->js.ps);
362 hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
364 switch (node->join.jointype)
370 hjstate->hj_NullInnerTupleSlot =
371 ExecInitNullTupleSlot(estate,
372 ExecGetResultType(innerPlanState(hjstate)));
375 elog(ERROR, "unrecognized join type: %d",
376 (int) node->join.jointype);
380 * now for some voodoo. our temporary tuple slot is actually the
381 * result tuple slot of the Hash node (which is our inner plan). we
382 * do this because Hash nodes don't return tuples via ExecProcNode()
383 * -- instead the hash join node uses ExecScanHashBucket() to get at
384 * the contents of the hash table. -cim 6/9/91
387 HashState *hashstate = (HashState *) innerPlanState(hjstate);
388 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
390 hjstate->hj_HashTupleSlot = slot;
394 * initialize tuple type and projection info
396 ExecAssignResultTypeFromTL(&hjstate->js.ps);
397 ExecAssignProjectionInfo(&hjstate->js.ps);
399 ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
400 ExecGetResultType(outerPlanState(hjstate)),
404 * initialize hash-specific info
407 hjstate->hj_hashdone = false;
408 hjstate->hj_HashTable = (HashJoinTable) NULL;
410 hjstate->hj_CurBucketNo = 0;
411 hjstate->hj_CurTuple = (HashJoinTuple) NULL;
414 * Deconstruct the hash clauses into outer and inner argument values,
415 * so that we can evaluate those subexpressions separately. Also make
416 * a list of the hash operator OIDs, in preparation for looking up the
417 * hash functions to use.
422 foreach(hcl, hjstate->hashclauses)
424 FuncExprState *fstate = (FuncExprState *) lfirst(hcl);
427 Assert(IsA(fstate, FuncExprState));
428 hclause = (OpExpr *) fstate->xprstate.expr;
429 Assert(IsA(hclause, OpExpr));
430 lclauses = lappend(lclauses, lfirst(fstate->args));
431 rclauses = lappend(rclauses, lsecond(fstate->args));
432 hoperators = lappendo(hoperators, hclause->opno);
434 hjstate->hj_OuterHashKeys = lclauses;
435 hjstate->hj_InnerHashKeys = rclauses;
436 hjstate->hj_HashOperators = hoperators;
437 /* child Hash node needs to evaluate inner hash keys, too */
438 ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
440 hjstate->js.ps.ps_OuterTupleSlot = NULL;
441 hjstate->js.ps.ps_TupFromTlist = false;
442 hjstate->hj_NeedNewOuter = true;
443 hjstate->hj_MatchedOuter = false;
449 ExecCountSlotsHashJoin(HashJoin *node)
451 return ExecCountSlotsNode(outerPlan(node)) +
452 ExecCountSlotsNode(innerPlan(node)) +
456 /* ----------------------------------------------------------------
459 * clean up routine for HashJoin node
460 * ----------------------------------------------------------------
463 ExecEndHashJoin(HashJoinState *node)
468 if (node->hj_HashTable)
470 ExecHashTableDestroy(node->hj_HashTable);
471 node->hj_HashTable = NULL;
475 * Free the exprcontext
477 ExecFreeExprContext(&node->js.ps);
480 * clean out the tuple table
482 ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
483 ExecClearTuple(node->hj_OuterTupleSlot);
484 ExecClearTuple(node->hj_HashTupleSlot);
489 ExecEndNode(outerPlanState(node));
490 ExecEndNode(innerPlanState(node));
493 /* ----------------------------------------------------------------
494 * ExecHashJoinOuterGetTuple
496 * get the next outer tuple for hashjoin: either by
497 * executing a plan node as in the first pass, or from
498 * the tmp files for the hashjoin batches.
499 * ----------------------------------------------------------------
502 static TupleTableSlot *
503 ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate)
505 HashJoinTable hashtable = hjstate->hj_HashTable;
506 int curbatch = hashtable->curbatch;
507 TupleTableSlot *slot;
510 { /* if it is the first pass */
511 slot = ExecProcNode(node);
512 if (!TupIsNull(slot))
516 * We have just reached the end of the first pass. Try to switch
519 curbatch = ExecHashJoinNewBatch(hjstate);
523 * Try to read from a temp file. Loop allows us to advance to new
526 while (curbatch <= hashtable->nbatch)
528 slot = ExecHashJoinGetSavedTuple(hjstate,
529 hashtable->outerBatchFile[curbatch - 1],
530 hjstate->hj_OuterTupleSlot);
531 if (!TupIsNull(slot))
533 curbatch = ExecHashJoinNewBatch(hjstate);
536 /* Out of batches... */
540 /* ----------------------------------------------------------------
541 * ExecHashJoinGetSavedTuple
543 * read the next tuple from a tmp file
544 * ----------------------------------------------------------------
547 static TupleTableSlot *
548 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
550 TupleTableSlot *tupleSlot)
556 nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
558 return NULL; /* end of file */
559 if (nread != sizeof(HeapTupleData))
561 (errcode_for_file_access(),
562 errmsg("could not read from hash-join temporary file: %m")));
563 heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
564 memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
565 heapTuple->t_datamcxt = CurrentMemoryContext;
566 heapTuple->t_data = (HeapTupleHeader)
567 ((char *) heapTuple + HEAPTUPLESIZE);
568 nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
569 if (nread != (size_t) htup.t_len)
571 (errcode_for_file_access(),
572 errmsg("could not read from hash-join temporary file: %m")));
573 return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
576 /* ----------------------------------------------------------------
577 * ExecHashJoinNewBatch
579 * switch to a new hashjoin batch
580 * ----------------------------------------------------------------
583 ExecHashJoinNewBatch(HashJoinState *hjstate)
585 HashJoinTable hashtable = hjstate->hj_HashTable;
586 int nbatch = hashtable->nbatch;
587 int newbatch = hashtable->curbatch + 1;
588 long *innerBatchSize = hashtable->innerBatchSize;
589 long *outerBatchSize = hashtable->outerBatchSize;
591 TupleTableSlot *slot;
592 ExprContext *econtext;
598 * We no longer need the previous outer batch file; close it right
599 * away to free disk space.
601 BufFileClose(hashtable->outerBatchFile[newbatch - 2]);
602 hashtable->outerBatchFile[newbatch - 2] = NULL;
606 * We can skip over any batches that are empty on either side. Release
607 * associated temp files right away.
609 while (newbatch <= nbatch &&
610 (innerBatchSize[newbatch - 1] == 0L ||
611 outerBatchSize[newbatch - 1] == 0L))
613 BufFileClose(hashtable->innerBatchFile[newbatch - 1]);
614 hashtable->innerBatchFile[newbatch - 1] = NULL;
615 BufFileClose(hashtable->outerBatchFile[newbatch - 1]);
616 hashtable->outerBatchFile[newbatch - 1] = NULL;
620 if (newbatch > nbatch)
621 return newbatch; /* no more batches */
624 * Rewind inner and outer batch files for this batch, so that we can
625 * start reading them.
627 if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0, 0L, SEEK_SET))
629 (errcode_for_file_access(),
630 errmsg("could not rewind hash-join temporary file: %m")));
632 innerFile = hashtable->innerBatchFile[newbatch - 1];
634 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
636 (errcode_for_file_access(),
637 errmsg("could not rewind hash-join temporary file: %m")));
640 * Reload the hash table with the new inner batch
642 ExecHashTableReset(hashtable, innerBatchSize[newbatch - 1]);
644 econtext = hjstate->js.ps.ps_ExprContext;
645 innerhashkeys = hjstate->hj_InnerHashKeys;
647 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
649 hjstate->hj_HashTupleSlot))
652 econtext->ecxt_innertuple = slot;
653 ExecHashTableInsert(hashtable, econtext, innerhashkeys);
657 * after we build the hash table, the inner batch file is no longer
660 BufFileClose(innerFile);
661 hashtable->innerBatchFile[newbatch - 1] = NULL;
663 hashtable->curbatch = newbatch;
667 /* ----------------------------------------------------------------
668 * ExecHashJoinSaveTuple
670 * save a tuple to a tmp file.
672 * The data recorded in the file for each tuple is an image of its
673 * HeapTupleData (with meaningless t_data pointer) followed by the
674 * HeapTupleHeader and tuple data.
675 * ----------------------------------------------------------------
679 ExecHashJoinSaveTuple(HeapTuple heapTuple,
684 written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
685 if (written != sizeof(HeapTupleData))
687 (errcode_for_file_access(),
688 errmsg("could not write to hash-join temporary file: %m")));
689 written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
690 if (written != (size_t) heapTuple->t_len)
692 (errcode_for_file_access(),
693 errmsg("could not write to hash-join temporary file: %m")));
697 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
700 * If we haven't yet built the hash table then we can just return;
701 * nothing done yet, so nothing to undo.
703 if (!node->hj_hashdone)
705 Assert(node->hj_HashTable != NULL);
708 * In a multi-batch join, we currently have to do rescans the hard
709 * way, primarily because batch temp files may have already been
710 * released. But if it's a single-batch join, and there is no
711 * parameter change for the inner subnode, then we can just re-use the
712 * existing hash table without rebuilding it.
714 if (node->hj_HashTable->nbatch == 0 &&
715 ((PlanState *) node)->righttree->chgParam == NULL)
717 /* okay to reuse the hash table; needn't rescan inner, either */
721 /* must destroy and rebuild hash table */
722 node->hj_hashdone = false;
723 ExecHashTableDestroy(node->hj_HashTable);
724 node->hj_HashTable = NULL;
727 * if chgParam of subnode is not null then plan will be re-scanned
728 * by first ExecProcNode.
730 if (((PlanState *) node)->righttree->chgParam == NULL)
731 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
734 /* Always reset intra-tuple state */
735 node->hj_CurBucketNo = 0;
736 node->hj_CurTuple = (HashJoinTuple) NULL;
738 node->js.ps.ps_OuterTupleSlot = (TupleTableSlot *) NULL;
739 node->js.ps.ps_TupFromTlist = false;
740 node->hj_NeedNewOuter = true;
741 node->hj_MatchedOuter = false;
744 * if chgParam of subnode is not null then plan will be re-scanned by
745 * first ExecProcNode.
747 if (((PlanState *) node)->lefttree->chgParam == NULL)
748 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);