/*-------------------------------------------------------------------------
 *
 * nodeHashjoin.c
 *	  Routines to handle hash join nodes
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.86 2007/01/05 22:19:28 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "executor/executor.h"
#include "executor/hashjoin.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
#include "utils/memutils.h"

25 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
26 HashJoinState *hjstate,
28 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
31 TupleTableSlot *tupleSlot);
32 static int ExecHashJoinNewBatch(HashJoinState *hjstate);
35 /* ----------------------------------------------------------------
38 * This function implements the Hybrid Hashjoin algorithm.
40 * Note: the relation we build hash table on is the "inner"
41 * the other one is "outer".
42 * ----------------------------------------------------------------
44 TupleTableSlot * /* return: a tuple or NULL */
45 ExecHashJoin(HashJoinState *node)
52 TupleTableSlot *inntuple;
53 ExprContext *econtext;
55 HashJoinTable hashtable;
56 HashJoinTuple curtuple;
57 TupleTableSlot *outerTupleSlot;
62 * get information from HashJoin node
64 estate = node->js.ps.state;
65 joinqual = node->js.joinqual;
66 otherqual = node->js.ps.qual;
67 hashNode = (HashState *) innerPlanState(node);
68 outerNode = outerPlanState(node);
71 * get information from HashJoin state
73 hashtable = node->hj_HashTable;
74 econtext = node->js.ps.ps_ExprContext;
77 * Check to see if we're still projecting out tuples from a previous join
78 * tuple (because there is a function-returning-set in the projection
79 * expressions). If so, try to project another one.
81 if (node->js.ps.ps_TupFromTlist)
83 TupleTableSlot *result;
85 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
86 if (isDone == ExprMultipleResult)
88 /* Done with that source tuple... */
89 node->js.ps.ps_TupFromTlist = false;
93 * If we're doing an IN join, we want to return at most one row per outer
94 * tuple; so we can stop scanning the inner scan if we matched on the
97 if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
98 node->hj_NeedNewOuter = true;
101 * Reset per-tuple memory context to free any expression evaluation
102 * storage allocated in the previous tuple cycle. Note this can't happen
103 * until we're done projecting out tuples from a join tuple.
105 ResetExprContext(econtext);
108 * if this is the first call, build the hash table for inner relation
110 if (hashtable == NULL)
113 * If the outer relation is completely empty, we can quit without
114 * building the hash table. However, for an inner join it is only a
115 * win to check this when the outer relation's startup cost is less
116 * than the projected cost of building the hash table. Otherwise it's
117 * best to build the hash table first and see if the inner relation is
118 * empty. (When it's an outer join, we should always make this check,
119 * since we aren't going to be able to skip the join on the strength
120 * of an empty inner relation anyway.)
122 * If we are rescanning the join, we make use of information gained on
123 * the previous scan: don't bother to try the prefetch if the previous
124 * scan found the outer relation nonempty. This is not 100% reliable
125 * since with new parameters the outer relation might yield different
126 * results, but it's a good heuristic.
128 * The only way to make the check is to try to fetch a tuple from the
129 * outer plan node. If we succeed, we have to stash it away for later
130 * consumption by ExecHashJoinOuterGetTuple.
132 if (node->js.jointype == JOIN_LEFT ||
133 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
134 !node->hj_OuterNotEmpty))
136 node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
137 if (TupIsNull(node->hj_FirstOuterTupleSlot))
139 node->hj_OuterNotEmpty = false;
143 node->hj_OuterNotEmpty = true;
146 node->hj_FirstOuterTupleSlot = NULL;
149 * create the hash table
151 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
152 node->hj_HashOperators);
153 node->hj_HashTable = hashtable;
156 * execute the Hash node, to build the hash table
158 hashNode->hashtable = hashtable;
159 (void) MultiExecProcNode((PlanState *) hashNode);
162 * If the inner relation is completely empty, and we're not doing an
163 * outer join, we can quit without scanning the outer relation.
165 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
169 * need to remember whether nbatch has increased since we began
170 * scanning the outer relation
172 hashtable->nbatch_outstart = hashtable->nbatch;
175 * Reset OuterNotEmpty for scan. (It's OK if we fetched a tuple
176 * above, because ExecHashJoinOuterGetTuple will immediately set it
179 node->hj_OuterNotEmpty = false;
183 * run the hash join process
188 * If we don't have an outer tuple, get the next one
190 if (node->hj_NeedNewOuter)
192 outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
195 if (TupIsNull(outerTupleSlot))
201 node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
202 econtext->ecxt_outertuple = outerTupleSlot;
203 node->hj_NeedNewOuter = false;
204 node->hj_MatchedOuter = false;
207 * now we have an outer tuple, find the corresponding bucket for
208 * this tuple from the hash table
210 node->hj_CurHashValue = hashvalue;
211 ExecHashGetBucketAndBatch(hashtable, hashvalue,
212 &node->hj_CurBucketNo, &batchno);
213 node->hj_CurTuple = NULL;
216 * Now we've got an outer tuple and the corresponding hash bucket,
217 * but this tuple may not belong to the current batch.
219 if (batchno != hashtable->curbatch)
222 * Need to postpone this outer tuple to a later batch. Save it
223 * in the corresponding outer-batch file.
225 Assert(batchno > hashtable->curbatch);
226 ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
228 &hashtable->outerBatchFile[batchno]);
229 node->hj_NeedNewOuter = true;
230 continue; /* loop around for a new outer tuple */
235 * OK, scan the selected hash bucket for matches
239 curtuple = ExecScanHashBucket(node, econtext);
240 if (curtuple == NULL)
241 break; /* out of matches */
244 * we've got a match, but still need to test non-hashed quals
246 inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
247 node->hj_HashTupleSlot,
248 false); /* don't pfree */
249 econtext->ecxt_innertuple = inntuple;
251 /* reset temp memory each time to avoid leaks from qual expr */
252 ResetExprContext(econtext);
255 * if we pass the qual, then save state for next call and have
256 * ExecProject form the projection, store it in the tuple table,
257 * and return the slot.
259 * Only the joinquals determine MatchedOuter status, but all quals
260 * must pass to actually return the tuple.
262 if (joinqual == NIL || ExecQual(joinqual, econtext, false))
264 node->hj_MatchedOuter = true;
266 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
268 TupleTableSlot *result;
270 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
272 if (isDone != ExprEndResult)
274 node->js.ps.ps_TupFromTlist =
275 (isDone == ExprMultipleResult);
281 * If we didn't return a tuple, may need to set NeedNewOuter
283 if (node->js.jointype == JOIN_IN)
285 node->hj_NeedNewOuter = true;
286 break; /* out of loop over hash bucket */
292 * Now the current outer tuple has run out of matches, so check
293 * whether to emit a dummy outer-join tuple. If not, loop around to
294 * get a new outer tuple.
296 node->hj_NeedNewOuter = true;
298 if (!node->hj_MatchedOuter &&
299 node->js.jointype == JOIN_LEFT)
302 * We are doing an outer join and there were no join matches for
303 * this outer tuple. Generate a fake join tuple with nulls for
304 * the inner tuple, and return it if it passes the non-join quals.
306 econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
308 if (ExecQual(otherqual, econtext, false))
311 * qualification was satisfied so we project and return the
312 * slot containing the result tuple using ExecProject().
314 TupleTableSlot *result;
316 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
318 if (isDone != ExprEndResult)
320 node->js.ps.ps_TupFromTlist =
321 (isDone == ExprMultipleResult);
329 /* ----------------------------------------------------------------
332 * Init routine for HashJoin node.
333 * ----------------------------------------------------------------
336 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
338 HashJoinState *hjstate;
346 /* check for unsupported flags */
347 Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
350 * create state structure
352 hjstate = makeNode(HashJoinState);
353 hjstate->js.ps.plan = (Plan *) node;
354 hjstate->js.ps.state = estate;
357 * Miscellaneous initialization
359 * create expression context for node
361 ExecAssignExprContext(estate, &hjstate->js.ps);
364 * initialize child expressions
366 hjstate->js.ps.targetlist = (List *)
367 ExecInitExpr((Expr *) node->join.plan.targetlist,
368 (PlanState *) hjstate);
369 hjstate->js.ps.qual = (List *)
370 ExecInitExpr((Expr *) node->join.plan.qual,
371 (PlanState *) hjstate);
372 hjstate->js.jointype = node->join.jointype;
373 hjstate->js.joinqual = (List *)
374 ExecInitExpr((Expr *) node->join.joinqual,
375 (PlanState *) hjstate);
376 hjstate->hashclauses = (List *)
377 ExecInitExpr((Expr *) node->hashclauses,
378 (PlanState *) hjstate);
381 * initialize child nodes
383 * Note: we could suppress the REWIND flag for the inner input, which
384 * would amount to betting that the hash will be a single batch. Not
385 * clear if this would be a win or not.
387 outerNode = outerPlan(node);
388 hashNode = (Hash *) innerPlan(node);
390 outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
391 innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
393 #define HASHJOIN_NSLOTS 3
396 * tuple table initialization
398 ExecInitResultTupleSlot(estate, &hjstate->js.ps);
399 hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
401 switch (node->join.jointype)
407 hjstate->hj_NullInnerTupleSlot =
408 ExecInitNullTupleSlot(estate,
409 ExecGetResultType(innerPlanState(hjstate)));
412 elog(ERROR, "unrecognized join type: %d",
413 (int) node->join.jointype);
417 * now for some voodoo. our temporary tuple slot is actually the result
418 * tuple slot of the Hash node (which is our inner plan). we do this
419 * because Hash nodes don't return tuples via ExecProcNode() -- instead
420 * the hash join node uses ExecScanHashBucket() to get at the contents of
421 * the hash table. -cim 6/9/91
424 HashState *hashstate = (HashState *) innerPlanState(hjstate);
425 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
427 hjstate->hj_HashTupleSlot = slot;
431 * initialize tuple type and projection info
433 ExecAssignResultTypeFromTL(&hjstate->js.ps);
434 ExecAssignProjectionInfo(&hjstate->js.ps);
436 ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
437 ExecGetResultType(outerPlanState(hjstate)));
440 * initialize hash-specific info
442 hjstate->hj_HashTable = NULL;
443 hjstate->hj_FirstOuterTupleSlot = NULL;
445 hjstate->hj_CurHashValue = 0;
446 hjstate->hj_CurBucketNo = 0;
447 hjstate->hj_CurTuple = NULL;
450 * Deconstruct the hash clauses into outer and inner argument values, so
451 * that we can evaluate those subexpressions separately. Also make a list
452 * of the hash operator OIDs, in preparation for looking up the hash
458 foreach(l, hjstate->hashclauses)
460 FuncExprState *fstate = (FuncExprState *) lfirst(l);
463 Assert(IsA(fstate, FuncExprState));
464 hclause = (OpExpr *) fstate->xprstate.expr;
465 Assert(IsA(hclause, OpExpr));
466 lclauses = lappend(lclauses, linitial(fstate->args));
467 rclauses = lappend(rclauses, lsecond(fstate->args));
468 hoperators = lappend_oid(hoperators, hclause->opno);
470 hjstate->hj_OuterHashKeys = lclauses;
471 hjstate->hj_InnerHashKeys = rclauses;
472 hjstate->hj_HashOperators = hoperators;
473 /* child Hash node needs to evaluate inner hash keys, too */
474 ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
476 hjstate->js.ps.ps_OuterTupleSlot = NULL;
477 hjstate->js.ps.ps_TupFromTlist = false;
478 hjstate->hj_NeedNewOuter = true;
479 hjstate->hj_MatchedOuter = false;
480 hjstate->hj_OuterNotEmpty = false;
486 ExecCountSlotsHashJoin(HashJoin *node)
488 return ExecCountSlotsNode(outerPlan(node)) +
489 ExecCountSlotsNode(innerPlan(node)) +
493 /* ----------------------------------------------------------------
496 * clean up routine for HashJoin node
497 * ----------------------------------------------------------------
500 ExecEndHashJoin(HashJoinState *node)
505 if (node->hj_HashTable)
507 ExecHashTableDestroy(node->hj_HashTable);
508 node->hj_HashTable = NULL;
512 * Free the exprcontext
514 ExecFreeExprContext(&node->js.ps);
517 * clean out the tuple table
519 ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
520 ExecClearTuple(node->hj_OuterTupleSlot);
521 ExecClearTuple(node->hj_HashTupleSlot);
526 ExecEndNode(outerPlanState(node));
527 ExecEndNode(innerPlanState(node));
531 * ExecHashJoinOuterGetTuple
533 * get the next outer tuple for hashjoin: either by
534 * executing a plan node in the first pass, or from
535 * the temp files for the hashjoin batches.
537 * Returns a null slot if no more outer tuples. On success, the tuple's
538 * hash value is stored at *hashvalue --- this is either originally computed,
539 * or re-read from the temp file.
541 static TupleTableSlot *
542 ExecHashJoinOuterGetTuple(PlanState *outerNode,
543 HashJoinState *hjstate,
546 HashJoinTable hashtable = hjstate->hj_HashTable;
547 int curbatch = hashtable->curbatch;
548 TupleTableSlot *slot;
551 { /* if it is the first pass */
554 * Check to see if first outer tuple was already fetched by
555 * ExecHashJoin() and not used yet.
557 slot = hjstate->hj_FirstOuterTupleSlot;
558 if (!TupIsNull(slot))
559 hjstate->hj_FirstOuterTupleSlot = NULL;
561 slot = ExecProcNode(outerNode);
562 if (!TupIsNull(slot))
565 * We have to compute the tuple's hash value.
567 ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
569 econtext->ecxt_outertuple = slot;
570 *hashvalue = ExecHashGetHashValue(hashtable, econtext,
571 hjstate->hj_OuterHashKeys);
573 /* remember outer relation is not empty for possible rescan */
574 hjstate->hj_OuterNotEmpty = true;
580 * We have just reached the end of the first pass. Try to switch to a
583 curbatch = ExecHashJoinNewBatch(hjstate);
587 * Try to read from a temp file. Loop allows us to advance to new batches
588 * as needed. NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
589 * don't try to optimize this loop.
591 while (curbatch < hashtable->nbatch)
593 slot = ExecHashJoinGetSavedTuple(hjstate,
594 hashtable->outerBatchFile[curbatch],
596 hjstate->hj_OuterTupleSlot);
597 if (!TupIsNull(slot))
599 curbatch = ExecHashJoinNewBatch(hjstate);
602 /* Out of batches... */
607 * ExecHashJoinNewBatch
608 * switch to a new hashjoin batch
610 * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
611 * We will never return a batch number that has an empty outer batch file.
614 ExecHashJoinNewBatch(HashJoinState *hjstate)
616 HashJoinTable hashtable = hjstate->hj_HashTable;
620 TupleTableSlot *slot;
624 nbatch = hashtable->nbatch;
625 curbatch = hashtable->curbatch;
630 * We no longer need the previous outer batch file; close it right
631 * away to free disk space.
633 if (hashtable->outerBatchFile[curbatch])
634 BufFileClose(hashtable->outerBatchFile[curbatch]);
635 hashtable->outerBatchFile[curbatch] = NULL;
639 * We can always skip over any batches that are completely empty on both
640 * sides. We can sometimes skip over batches that are empty on only one
641 * side, but there are exceptions:
643 * 1. In a LEFT JOIN, we have to process outer batches even if the inner
646 * 2. If we have increased nbatch since the initial estimate, we have to
647 * scan inner batches since they might contain tuples that need to be
648 * reassigned to later inner batches.
650 * 3. Similarly, if we have increased nbatch since starting the outer
651 * scan, we have to rescan outer batches in case they contain tuples that
652 * need to be reassigned.
655 while (curbatch < nbatch &&
656 (hashtable->outerBatchFile[curbatch] == NULL ||
657 hashtable->innerBatchFile[curbatch] == NULL))
659 if (hashtable->outerBatchFile[curbatch] &&
660 hjstate->js.jointype == JOIN_LEFT)
661 break; /* must process due to rule 1 */
662 if (hashtable->innerBatchFile[curbatch] &&
663 nbatch != hashtable->nbatch_original)
664 break; /* must process due to rule 2 */
665 if (hashtable->outerBatchFile[curbatch] &&
666 nbatch != hashtable->nbatch_outstart)
667 break; /* must process due to rule 3 */
668 /* We can ignore this batch. */
669 /* Release associated temp files right away. */
670 if (hashtable->innerBatchFile[curbatch])
671 BufFileClose(hashtable->innerBatchFile[curbatch]);
672 hashtable->innerBatchFile[curbatch] = NULL;
673 if (hashtable->outerBatchFile[curbatch])
674 BufFileClose(hashtable->outerBatchFile[curbatch]);
675 hashtable->outerBatchFile[curbatch] = NULL;
679 if (curbatch >= nbatch)
680 return curbatch; /* no more batches */
682 hashtable->curbatch = curbatch;
685 * Reload the hash table with the new inner batch (which could be empty)
687 ExecHashTableReset(hashtable);
689 innerFile = hashtable->innerBatchFile[curbatch];
691 if (innerFile != NULL)
693 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
695 (errcode_for_file_access(),
696 errmsg("could not rewind hash-join temporary file: %m")));
698 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
701 hjstate->hj_HashTupleSlot)))
704 * NOTE: some tuples may be sent to future batches. Also, it is
705 * possible for hashtable->nbatch to be increased here!
707 ExecHashTableInsert(hashtable, slot, hashvalue);
711 * after we build the hash table, the inner batch file is no longer
714 BufFileClose(innerFile);
715 hashtable->innerBatchFile[curbatch] = NULL;
719 * If there's no outer batch file, advance to next batch.
721 if (hashtable->outerBatchFile[curbatch] == NULL)
725 * Rewind outer batch file, so that we can start reading it.
727 if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
729 (errcode_for_file_access(),
730 errmsg("could not rewind hash-join temporary file: %m")));
736 * ExecHashJoinSaveTuple
737 * save a tuple to a batch file.
739 * The data recorded in the file for each tuple is its hash value,
740 * then the tuple in MinimalTuple format.
742 * Note: it is important always to call this in the regular executor
743 * context, not in a shorter-lived context; else the temp file buffers
744 * will get messed up.
747 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
750 BufFile *file = *fileptr;
755 /* First write to this batch file, so open it. */
756 file = BufFileCreateTemp(false);
760 written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
761 if (written != sizeof(uint32))
763 (errcode_for_file_access(),
764 errmsg("could not write to hash-join temporary file: %m")));
766 written = BufFileWrite(file, (void *) tuple, tuple->t_len);
767 if (written != tuple->t_len)
769 (errcode_for_file_access(),
770 errmsg("could not write to hash-join temporary file: %m")));
774 * ExecHashJoinGetSavedTuple
775 * read the next tuple from a batch file. Return NULL if no more.
777 * On success, *hashvalue is set to the tuple's hash value, and the tuple
778 * itself is stored in the given slot.
780 static TupleTableSlot *
781 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
784 TupleTableSlot *tupleSlot)
791 * Since both the hash value and the MinimalTuple length word are uint32,
792 * we can read them both in one BufFileRead() call without any type
795 nread = BufFileRead(file, (void *) header, sizeof(header));
796 if (nread == 0) /* end of file */
798 ExecClearTuple(tupleSlot);
801 if (nread != sizeof(header))
803 (errcode_for_file_access(),
804 errmsg("could not read from hash-join temporary file: %m")));
805 *hashvalue = header[0];
806 tuple = (MinimalTuple) palloc(header[1]);
807 tuple->t_len = header[1];
808 nread = BufFileRead(file,
809 (void *) ((char *) tuple + sizeof(uint32)),
810 header[1] - sizeof(uint32));
811 if (nread != header[1] - sizeof(uint32))
813 (errcode_for_file_access(),
814 errmsg("could not read from hash-join temporary file: %m")));
815 return ExecStoreMinimalTuple(tuple, tupleSlot, true);
820 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
823 * In a multi-batch join, we currently have to do rescans the hard way,
824 * primarily because batch temp files may have already been released. But
825 * if it's a single-batch join, and there is no parameter change for the
826 * inner subnode, then we can just re-use the existing hash table without
829 if (node->hj_HashTable != NULL)
831 if (node->hj_HashTable->nbatch == 1 &&
832 ((PlanState *) node)->righttree->chgParam == NULL)
835 * okay to reuse the hash table; needn't rescan inner, either.
837 * What we do need to do is reset our state about the emptiness of
838 * the outer relation, so that the new scan of the outer will
839 * update it correctly if it turns out to be empty this time.
840 * (There's no harm in clearing it now because ExecHashJoin won't
841 * need the info. In the other cases, where the hash table
842 * doesn't exist or we are destroying it, we leave this state
843 * alone because ExecHashJoin will need it the first time
846 node->hj_OuterNotEmpty = false;
850 /* must destroy and rebuild hash table */
851 ExecHashTableDestroy(node->hj_HashTable);
852 node->hj_HashTable = NULL;
855 * if chgParam of subnode is not null then plan will be re-scanned
856 * by first ExecProcNode.
858 if (((PlanState *) node)->righttree->chgParam == NULL)
859 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
863 /* Always reset intra-tuple state */
864 node->hj_CurHashValue = 0;
865 node->hj_CurBucketNo = 0;
866 node->hj_CurTuple = NULL;
868 node->js.ps.ps_OuterTupleSlot = NULL;
869 node->js.ps.ps_TupFromTlist = false;
870 node->hj_NeedNewOuter = true;
871 node->hj_MatchedOuter = false;
872 node->hj_FirstOuterTupleSlot = NULL;
875 * if chgParam of subnode is not null then plan will be re-scanned by
876 * first ExecProcNode.
878 if (((PlanState *) node)->lefttree->chgParam == NULL)
879 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);