]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c
Optimize multi-batch hash joins when the outer relation has a nonuniform
[postgresql] / src / backend / executor / nodeHashjoin.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *        Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.98 2009/03/21 00:04:38 tgl Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15
16 #include "postgres.h"
17
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "utils/memutils.h"
23
24
25 /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
26 #define HASHJOIN_IS_OUTER(hjstate)  ((hjstate)->hj_NullInnerTupleSlot != NULL)
27
28 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
29                                                   HashJoinState *hjstate,
30                                                   uint32 *hashvalue);
31 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
32                                                   BufFile *file,
33                                                   uint32 *hashvalue,
34                                                   TupleTableSlot *tupleSlot);
35 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
36
37
38 /* ----------------------------------------------------------------
39  *              ExecHashJoin
40  *
41  *              This function implements the Hybrid Hashjoin algorithm.
42  *
43  *              Note: the relation we build hash table on is the "inner"
44  *                        the other one is "outer".
45  * ----------------------------------------------------------------
46  */
47 TupleTableSlot *                                /* return: a tuple or NULL */
48 ExecHashJoin(HashJoinState *node)
49 {
50         EState     *estate;
51         PlanState  *outerNode;
52         HashState  *hashNode;
53         List       *joinqual;
54         List       *otherqual;
55         TupleTableSlot *inntuple;
56         ExprContext *econtext;
57         ExprDoneCond isDone;
58         HashJoinTable hashtable;
59         HashJoinTuple curtuple;
60         TupleTableSlot *outerTupleSlot;
61         uint32          hashvalue;
62         int                     batchno;
63
64         /*
65          * get information from HashJoin node
66          */
67         estate = node->js.ps.state;
68         joinqual = node->js.joinqual;
69         otherqual = node->js.ps.qual;
70         hashNode = (HashState *) innerPlanState(node);
71         outerNode = outerPlanState(node);
72
73         /*
74          * get information from HashJoin state
75          */
76         hashtable = node->hj_HashTable;
77         econtext = node->js.ps.ps_ExprContext;
78
79         /*
80          * Check to see if we're still projecting out tuples from a previous join
81          * tuple (because there is a function-returning-set in the projection
82          * expressions).  If so, try to project another one.
83          */
84         if (node->js.ps.ps_TupFromTlist)
85         {
86                 TupleTableSlot *result;
87
88                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
89                 if (isDone == ExprMultipleResult)
90                         return result;
91                 /* Done with that source tuple... */
92                 node->js.ps.ps_TupFromTlist = false;
93         }
94
95         /*
96          * Reset per-tuple memory context to free any expression evaluation
97          * storage allocated in the previous tuple cycle.  Note this can't happen
98          * until we're done projecting out tuples from a join tuple.
99          */
100         ResetExprContext(econtext);
101
102         /*
103          * if this is the first call, build the hash table for inner relation
104          */
105         if (hashtable == NULL)
106         {
107                 /*
108                  * If the outer relation is completely empty, we can quit without
109                  * building the hash table.  However, for an inner join it is only a
110                  * win to check this when the outer relation's startup cost is less
111                  * than the projected cost of building the hash table.  Otherwise it's
112                  * best to build the hash table first and see if the inner relation is
113                  * empty.  (When it's an outer join, we should always make this check,
114                  * since we aren't going to be able to skip the join on the strength
115                  * of an empty inner relation anyway.)
116                  *
117                  * If we are rescanning the join, we make use of information gained on
118                  * the previous scan: don't bother to try the prefetch if the previous
119                  * scan found the outer relation nonempty.      This is not 100% reliable
120                  * since with new parameters the outer relation might yield different
121                  * results, but it's a good heuristic.
122                  *
123                  * The only way to make the check is to try to fetch a tuple from the
124                  * outer plan node.  If we succeed, we have to stash it away for later
125                  * consumption by ExecHashJoinOuterGetTuple.
126                  */
127                 if (HASHJOIN_IS_OUTER(node) ||
128                         (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
129                          !node->hj_OuterNotEmpty))
130                 {
131                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
132                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
133                         {
134                                 node->hj_OuterNotEmpty = false;
135                                 return NULL;
136                         }
137                         else
138                                 node->hj_OuterNotEmpty = true;
139                 }
140                 else
141                         node->hj_FirstOuterTupleSlot = NULL;
142
143                 /*
144                  * create the hash table
145                  */
146                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
147                                                                                 node->hj_HashOperators);
148                 node->hj_HashTable = hashtable;
149
150                 /*
151                  * execute the Hash node, to build the hash table
152                  */
153                 hashNode->hashtable = hashtable;
154                 (void) MultiExecProcNode((PlanState *) hashNode);
155
156                 /*
157                  * If the inner relation is completely empty, and we're not doing an
158                  * outer join, we can quit without scanning the outer relation.
159                  */
160                 if (hashtable->totalTuples == 0 && !HASHJOIN_IS_OUTER(node))
161                         return NULL;
162
163                 /*
164                  * need to remember whether nbatch has increased since we began
165                  * scanning the outer relation
166                  */
167                 hashtable->nbatch_outstart = hashtable->nbatch;
168
169                 /*
170                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
171                  * above, because ExecHashJoinOuterGetTuple will immediately set it
172                  * again.)
173                  */
174                 node->hj_OuterNotEmpty = false;
175         }
176
177         /*
178          * run the hash join process
179          */
180         for (;;)
181         {
182                 /*
183                  * If we don't have an outer tuple, get the next one
184                  */
185                 if (node->hj_NeedNewOuter)
186                 {
187                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
188                                                                                                            node,
189                                                                                                            &hashvalue);
190                         if (TupIsNull(outerTupleSlot))
191                         {
192                                 /* end of join */
193                                 return NULL;
194                         }
195
196                         econtext->ecxt_outertuple = outerTupleSlot;
197                         node->hj_NeedNewOuter = false;
198                         node->hj_MatchedOuter = false;
199
200                         /*
201                          * Now we have an outer tuple; find the corresponding bucket for
202                          * this tuple in the main hash table or skew hash table.
203                          */
204                         node->hj_CurHashValue = hashvalue;
205                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
206                                                                           &node->hj_CurBucketNo, &batchno);
207                         node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
208                                                                                                                          hashvalue);
209                         node->hj_CurTuple = NULL;
210
211                         /*
212                          * Now we've got an outer tuple and the corresponding hash bucket,
213                          * but it might not belong to the current batch, or it might
214                          * match a skew bucket.
215                          */
216                         if (batchno != hashtable->curbatch &&
217                                 node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
218                         {
219                                 /*
220                                  * Need to postpone this outer tuple to a later batch. Save it
221                                  * in the corresponding outer-batch file.
222                                  */
223                                 Assert(batchno > hashtable->curbatch);
224                                 ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
225                                                                           hashvalue,
226                                                                           &hashtable->outerBatchFile[batchno]);
227                                 node->hj_NeedNewOuter = true;
228                                 continue;               /* loop around for a new outer tuple */
229                         }
230                 }
231
232                 /*
233                  * OK, scan the selected hash bucket for matches
234                  */
235                 for (;;)
236                 {
237                         curtuple = ExecScanHashBucket(node, econtext);
238                         if (curtuple == NULL)
239                                 break;                  /* out of matches */
240
241                         /*
242                          * we've got a match, but still need to test non-hashed quals
243                          */
244                         inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
245                                                                                          node->hj_HashTupleSlot,
246                                                                                          false);        /* don't pfree */
247                         econtext->ecxt_innertuple = inntuple;
248
249                         /* reset temp memory each time to avoid leaks from qual expr */
250                         ResetExprContext(econtext);
251
252                         /*
253                          * if we pass the qual, then save state for next call and have
254                          * ExecProject form the projection, store it in the tuple table,
255                          * and return the slot.
256                          *
257                          * Only the joinquals determine MatchedOuter status, but all quals
258                          * must pass to actually return the tuple.
259                          */
260                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
261                         {
262                                 node->hj_MatchedOuter = true;
263
264                                 /* In an antijoin, we never return a matched tuple */
265                                 if (node->js.jointype == JOIN_ANTI)
266                                 {
267                                         node->hj_NeedNewOuter = true;
268                                         break;          /* out of loop over hash bucket */
269                                 }
270
271                                 /*
272                                  * In a semijoin, we'll consider returning the first match,
273                                  * but after that we're done with this outer tuple.
274                                  */
275                                 if (node->js.jointype == JOIN_SEMI)
276                                         node->hj_NeedNewOuter = true;
277
278                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
279                                 {
280                                         TupleTableSlot *result;
281
282                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
283
284                                         if (isDone != ExprEndResult)
285                                         {
286                                                 node->js.ps.ps_TupFromTlist =
287                                                         (isDone == ExprMultipleResult);
288                                                 return result;
289                                         }
290                                 }
291
292                                 /*
293                                  * If semijoin and we didn't return the tuple, we're still
294                                  * done with this outer tuple.
295                                  */
296                                 if (node->js.jointype == JOIN_SEMI)
297                                         break;          /* out of loop over hash bucket */
298                         }
299                 }
300
301                 /*
302                  * Now the current outer tuple has run out of matches, so check
303                  * whether to emit a dummy outer-join tuple. If not, loop around to
304                  * get a new outer tuple.
305                  */
306                 node->hj_NeedNewOuter = true;
307
308                 if (!node->hj_MatchedOuter &&
309                         HASHJOIN_IS_OUTER(node))
310                 {
311                         /*
312                          * We are doing an outer join and there were no join matches for
313                          * this outer tuple.  Generate a fake join tuple with nulls for
314                          * the inner tuple, and return it if it passes the non-join quals.
315                          */
316                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
317
318                         if (otherqual == NIL || ExecQual(otherqual, econtext, false))
319                         {
320                                 /*
321                                  * qualification was satisfied so we project and return the
322                                  * slot containing the result tuple using ExecProject().
323                                  */
324                                 TupleTableSlot *result;
325
326                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
327
328                                 if (isDone != ExprEndResult)
329                                 {
330                                         node->js.ps.ps_TupFromTlist =
331                                                 (isDone == ExprMultipleResult);
332                                         return result;
333                                 }
334                         }
335                 }
336         }
337 }
338
339 /* ----------------------------------------------------------------
340  *              ExecInitHashJoin
341  *
342  *              Init routine for HashJoin node.
343  * ----------------------------------------------------------------
344  */
345 HashJoinState *
346 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
347 {
348         HashJoinState *hjstate;
349         Plan       *outerNode;
350         Hash       *hashNode;
351         List       *lclauses;
352         List       *rclauses;
353         List       *hoperators;
354         ListCell   *l;
355
356         /* check for unsupported flags */
357         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
358
359         /*
360          * create state structure
361          */
362         hjstate = makeNode(HashJoinState);
363         hjstate->js.ps.plan = (Plan *) node;
364         hjstate->js.ps.state = estate;
365
366         /*
367          * Miscellaneous initialization
368          *
369          * create expression context for node
370          */
371         ExecAssignExprContext(estate, &hjstate->js.ps);
372
373         /*
374          * initialize child expressions
375          */
376         hjstate->js.ps.targetlist = (List *)
377                 ExecInitExpr((Expr *) node->join.plan.targetlist,
378                                          (PlanState *) hjstate);
379         hjstate->js.ps.qual = (List *)
380                 ExecInitExpr((Expr *) node->join.plan.qual,
381                                          (PlanState *) hjstate);
382         hjstate->js.jointype = node->join.jointype;
383         hjstate->js.joinqual = (List *)
384                 ExecInitExpr((Expr *) node->join.joinqual,
385                                          (PlanState *) hjstate);
386         hjstate->hashclauses = (List *)
387                 ExecInitExpr((Expr *) node->hashclauses,
388                                          (PlanState *) hjstate);
389
390         /*
391          * initialize child nodes
392          *
393          * Note: we could suppress the REWIND flag for the inner input, which
394          * would amount to betting that the hash will be a single batch.  Not
395          * clear if this would be a win or not.
396          */
397         outerNode = outerPlan(node);
398         hashNode = (Hash *) innerPlan(node);
399
400         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
401         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
402
403 #define HASHJOIN_NSLOTS 3
404
405         /*
406          * tuple table initialization
407          */
408         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
409         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
410
411         /* note: HASHJOIN_IS_OUTER macro depends on this initialization */
412         switch (node->join.jointype)
413         {
414                 case JOIN_INNER:
415                 case JOIN_SEMI:
416                         break;
417                 case JOIN_LEFT:
418                 case JOIN_ANTI:
419                         hjstate->hj_NullInnerTupleSlot =
420                                 ExecInitNullTupleSlot(estate,
421                                                                  ExecGetResultType(innerPlanState(hjstate)));
422                         break;
423                 default:
424                         elog(ERROR, "unrecognized join type: %d",
425                                  (int) node->join.jointype);
426         }
427
428         /*
429          * now for some voodoo.  our temporary tuple slot is actually the result
430          * tuple slot of the Hash node (which is our inner plan).  we do this
431          * because Hash nodes don't return tuples via ExecProcNode() -- instead
432          * the hash join node uses ExecScanHashBucket() to get at the contents of
433          * the hash table.      -cim 6/9/91
434          */
435         {
436                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
437                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
438
439                 hjstate->hj_HashTupleSlot = slot;
440         }
441
442         /*
443          * initialize tuple type and projection info
444          */
445         ExecAssignResultTypeFromTL(&hjstate->js.ps);
446         ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
447
448         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
449                                                   ExecGetResultType(outerPlanState(hjstate)));
450
451         /*
452          * initialize hash-specific info
453          */
454         hjstate->hj_HashTable = NULL;
455         hjstate->hj_FirstOuterTupleSlot = NULL;
456
457         hjstate->hj_CurHashValue = 0;
458         hjstate->hj_CurBucketNo = 0;
459         hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
460         hjstate->hj_CurTuple = NULL;
461
462         /*
463          * Deconstruct the hash clauses into outer and inner argument values, so
464          * that we can evaluate those subexpressions separately.  Also make a list
465          * of the hash operator OIDs, in preparation for looking up the hash
466          * functions to use.
467          */
468         lclauses = NIL;
469         rclauses = NIL;
470         hoperators = NIL;
471         foreach(l, hjstate->hashclauses)
472         {
473                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
474                 OpExpr     *hclause;
475
476                 Assert(IsA(fstate, FuncExprState));
477                 hclause = (OpExpr *) fstate->xprstate.expr;
478                 Assert(IsA(hclause, OpExpr));
479                 lclauses = lappend(lclauses, linitial(fstate->args));
480                 rclauses = lappend(rclauses, lsecond(fstate->args));
481                 hoperators = lappend_oid(hoperators, hclause->opno);
482         }
483         hjstate->hj_OuterHashKeys = lclauses;
484         hjstate->hj_InnerHashKeys = rclauses;
485         hjstate->hj_HashOperators = hoperators;
486         /* child Hash node needs to evaluate inner hash keys, too */
487         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
488
489         hjstate->js.ps.ps_TupFromTlist = false;
490         hjstate->hj_NeedNewOuter = true;
491         hjstate->hj_MatchedOuter = false;
492         hjstate->hj_OuterNotEmpty = false;
493
494         return hjstate;
495 }
496
497 int
498 ExecCountSlotsHashJoin(HashJoin *node)
499 {
500         return ExecCountSlotsNode(outerPlan(node)) +
501                 ExecCountSlotsNode(innerPlan(node)) +
502                 HASHJOIN_NSLOTS;
503 }
504
505 /* ----------------------------------------------------------------
506  *              ExecEndHashJoin
507  *
508  *              clean up routine for HashJoin node
509  * ----------------------------------------------------------------
510  */
511 void
512 ExecEndHashJoin(HashJoinState *node)
513 {
514         /*
515          * Free hash table
516          */
517         if (node->hj_HashTable)
518         {
519                 ExecHashTableDestroy(node->hj_HashTable);
520                 node->hj_HashTable = NULL;
521         }
522
523         /*
524          * Free the exprcontext
525          */
526         ExecFreeExprContext(&node->js.ps);
527
528         /*
529          * clean out the tuple table
530          */
531         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
532         ExecClearTuple(node->hj_OuterTupleSlot);
533         ExecClearTuple(node->hj_HashTupleSlot);
534
535         /*
536          * clean up subtrees
537          */
538         ExecEndNode(outerPlanState(node));
539         ExecEndNode(innerPlanState(node));
540 }
541
542 /*
543  * ExecHashJoinOuterGetTuple
544  *
545  *              get the next outer tuple for hashjoin: either by
546  *              executing a plan node in the first pass, or from
547  *              the temp files for the hashjoin batches.
548  *
549  * Returns a null slot if no more outer tuples.  On success, the tuple's
550  * hash value is stored at *hashvalue --- this is either originally computed,
551  * or re-read from the temp file.
552  */
553 static TupleTableSlot *
554 ExecHashJoinOuterGetTuple(PlanState *outerNode,
555                                                   HashJoinState *hjstate,
556                                                   uint32 *hashvalue)
557 {
558         HashJoinTable hashtable = hjstate->hj_HashTable;
559         int                     curbatch = hashtable->curbatch;
560         TupleTableSlot *slot;
561
562         if (curbatch == 0)                      /* if it is the first pass */
563         {
564                 /*
565                  * Check to see if first outer tuple was already fetched by
566                  * ExecHashJoin() and not used yet.
567                  */
568                 slot = hjstate->hj_FirstOuterTupleSlot;
569                 if (!TupIsNull(slot))
570                         hjstate->hj_FirstOuterTupleSlot = NULL;
571                 else
572                         slot = ExecProcNode(outerNode);
573
574                 while (!TupIsNull(slot))
575                 {
576                         /*
577                          * We have to compute the tuple's hash value.
578                          */
579                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
580
581                         econtext->ecxt_outertuple = slot;
582                         if (ExecHashGetHashValue(hashtable, econtext,
583                                                                          hjstate->hj_OuterHashKeys,
584                                                                          true,          /* outer tuple */
585                                                                          HASHJOIN_IS_OUTER(hjstate),
586                                                                          hashvalue))
587                         {
588                                 /* remember outer relation is not empty for possible rescan */
589                                 hjstate->hj_OuterNotEmpty = true;
590
591                                 return slot;
592                         }
593
594                         /*
595                          * That tuple couldn't match because of a NULL, so discard it and
596                          * continue with the next one.
597                          */
598                         slot = ExecProcNode(outerNode);
599                 }
600
601                 /*
602                  * We have just reached the end of the first pass. Try to switch to a
603                  * saved batch.
604                  */
605                 curbatch = ExecHashJoinNewBatch(hjstate);
606         }
607
608         /*
609          * Try to read from a temp file. Loop allows us to advance to new batches
610          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
611          * don't try to optimize this loop.
612          */
613         while (curbatch < hashtable->nbatch)
614         {
615                 slot = ExecHashJoinGetSavedTuple(hjstate,
616                                                                                  hashtable->outerBatchFile[curbatch],
617                                                                                  hashvalue,
618                                                                                  hjstate->hj_OuterTupleSlot);
619                 if (!TupIsNull(slot))
620                         return slot;
621                 curbatch = ExecHashJoinNewBatch(hjstate);
622         }
623
624         /* Out of batches... */
625         return NULL;
626 }
627
628 /*
629  * ExecHashJoinNewBatch
630  *              switch to a new hashjoin batch
631  *
632  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
633  * We will never return a batch number that has an empty outer batch file.
634  */
635 static int
636 ExecHashJoinNewBatch(HashJoinState *hjstate)
637 {
638         HashJoinTable hashtable = hjstate->hj_HashTable;
639         int                     nbatch;
640         int                     curbatch;
641         BufFile    *innerFile;
642         TupleTableSlot *slot;
643         uint32          hashvalue;
644
645 start_over:
646         nbatch = hashtable->nbatch;
647         curbatch = hashtable->curbatch;
648
649         if (curbatch > 0)
650         {
651                 /*
652                  * We no longer need the previous outer batch file; close it right
653                  * away to free disk space.
654                  */
655                 if (hashtable->outerBatchFile[curbatch])
656                         BufFileClose(hashtable->outerBatchFile[curbatch]);
657                 hashtable->outerBatchFile[curbatch] = NULL;
658         }
659         else                                            /* we just finished the first batch */
660         {
661                 /*
662                  * Reset some of the skew optimization state variables, since we
663                  * no longer need to consider skew tuples after the first batch.
664                  * The memory context reset we are about to do will release the
665                  * skew hashtable itself.
666                  */
667                 hashtable->skewEnabled = false;
668                 hashtable->skewBucket = NULL;
669                 hashtable->skewBucketNums = NULL;
670                 hashtable->spaceUsedSkew = 0;
671         }
672
673         /*
674          * We can always skip over any batches that are completely empty on both
675          * sides.  We can sometimes skip over batches that are empty on only one
676          * side, but there are exceptions:
677          *
678          * 1. In an outer join, we have to process outer batches even if the inner
679          * batch is empty.
680          *
681          * 2. If we have increased nbatch since the initial estimate, we have to
682          * scan inner batches since they might contain tuples that need to be
683          * reassigned to later inner batches.
684          *
685          * 3. Similarly, if we have increased nbatch since starting the outer
686          * scan, we have to rescan outer batches in case they contain tuples that
687          * need to be reassigned.
688          */
689         curbatch++;
690         while (curbatch < nbatch &&
691                    (hashtable->outerBatchFile[curbatch] == NULL ||
692                         hashtable->innerBatchFile[curbatch] == NULL))
693         {
694                 if (hashtable->outerBatchFile[curbatch] &&
695                         HASHJOIN_IS_OUTER(hjstate))
696                         break;                          /* must process due to rule 1 */
697                 if (hashtable->innerBatchFile[curbatch] &&
698                         nbatch != hashtable->nbatch_original)
699                         break;                          /* must process due to rule 2 */
700                 if (hashtable->outerBatchFile[curbatch] &&
701                         nbatch != hashtable->nbatch_outstart)
702                         break;                          /* must process due to rule 3 */
703                 /* We can ignore this batch. */
704                 /* Release associated temp files right away. */
705                 if (hashtable->innerBatchFile[curbatch])
706                         BufFileClose(hashtable->innerBatchFile[curbatch]);
707                 hashtable->innerBatchFile[curbatch] = NULL;
708                 if (hashtable->outerBatchFile[curbatch])
709                         BufFileClose(hashtable->outerBatchFile[curbatch]);
710                 hashtable->outerBatchFile[curbatch] = NULL;
711                 curbatch++;
712         }
713
714         if (curbatch >= nbatch)
715                 return curbatch;                /* no more batches */
716
717         hashtable->curbatch = curbatch;
718
719         /*
720          * Reload the hash table with the new inner batch (which could be empty)
721          */
722         ExecHashTableReset(hashtable);
723
724         innerFile = hashtable->innerBatchFile[curbatch];
725
726         if (innerFile != NULL)
727         {
728                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
729                         ereport(ERROR,
730                                         (errcode_for_file_access(),
731                                    errmsg("could not rewind hash-join temporary file: %m")));
732
733                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
734                                                                                                  innerFile,
735                                                                                                  &hashvalue,
736                                                                                                  hjstate->hj_HashTupleSlot)))
737                 {
738                         /*
739                          * NOTE: some tuples may be sent to future batches.  Also, it is
740                          * possible for hashtable->nbatch to be increased here!
741                          */
742                         ExecHashTableInsert(hashtable, slot, hashvalue);
743                 }
744
745                 /*
746                  * after we build the hash table, the inner batch file is no longer
747                  * needed
748                  */
749                 BufFileClose(innerFile);
750                 hashtable->innerBatchFile[curbatch] = NULL;
751         }
752
753         /*
754          * If there's no outer batch file, advance to next batch.
755          */
756         if (hashtable->outerBatchFile[curbatch] == NULL)
757                 goto start_over;
758
759         /*
760          * Rewind outer batch file, so that we can start reading it.
761          */
762         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
763                 ereport(ERROR,
764                                 (errcode_for_file_access(),
765                                  errmsg("could not rewind hash-join temporary file: %m")));
766
767         return curbatch;
768 }
769
770 /*
771  * ExecHashJoinSaveTuple
772  *              save a tuple to a batch file.
773  *
774  * The data recorded in the file for each tuple is its hash value,
775  * then the tuple in MinimalTuple format.
776  *
777  * Note: it is important always to call this in the regular executor
778  * context, not in a shorter-lived context; else the temp file buffers
779  * will get messed up.
780  */
781 void
782 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
783                                           BufFile **fileptr)
784 {
785         BufFile    *file = *fileptr;
786         size_t          written;
787
788         if (file == NULL)
789         {
790                 /* First write to this batch file, so open it. */
791                 file = BufFileCreateTemp(false);
792                 *fileptr = file;
793         }
794
795         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
796         if (written != sizeof(uint32))
797                 ereport(ERROR,
798                                 (errcode_for_file_access(),
799                                  errmsg("could not write to hash-join temporary file: %m")));
800
801         written = BufFileWrite(file, (void *) tuple, tuple->t_len);
802         if (written != tuple->t_len)
803                 ereport(ERROR,
804                                 (errcode_for_file_access(),
805                                  errmsg("could not write to hash-join temporary file: %m")));
806 }
807
808 /*
809  * ExecHashJoinGetSavedTuple
810  *              read the next tuple from a batch file.  Return NULL if no more.
811  *
812  * On success, *hashvalue is set to the tuple's hash value, and the tuple
813  * itself is stored in the given slot.
814  */
815 static TupleTableSlot *
816 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
817                                                   BufFile *file,
818                                                   uint32 *hashvalue,
819                                                   TupleTableSlot *tupleSlot)
820 {
821         uint32          header[2];
822         size_t          nread;
823         MinimalTuple tuple;
824
825         /*
826          * Since both the hash value and the MinimalTuple length word are uint32,
827          * we can read them both in one BufFileRead() call without any type
828          * cheating.
829          */
830         nread = BufFileRead(file, (void *) header, sizeof(header));
831         if (nread == 0)                         /* end of file */
832         {
833                 ExecClearTuple(tupleSlot);
834                 return NULL;
835         }
836         if (nread != sizeof(header))
837                 ereport(ERROR,
838                                 (errcode_for_file_access(),
839                                  errmsg("could not read from hash-join temporary file: %m")));
840         *hashvalue = header[0];
841         tuple = (MinimalTuple) palloc(header[1]);
842         tuple->t_len = header[1];
843         nread = BufFileRead(file,
844                                                 (void *) ((char *) tuple + sizeof(uint32)),
845                                                 header[1] - sizeof(uint32));
846         if (nread != header[1] - sizeof(uint32))
847                 ereport(ERROR,
848                                 (errcode_for_file_access(),
849                                  errmsg("could not read from hash-join temporary file: %m")));
850         return ExecStoreMinimalTuple(tuple, tupleSlot, true);
851 }
852
853
854 void
855 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
856 {
857         /*
858          * In a multi-batch join, we currently have to do rescans the hard way,
859          * primarily because batch temp files may have already been released. But
860          * if it's a single-batch join, and there is no parameter change for the
861          * inner subnode, then we can just re-use the existing hash table without
862          * rebuilding it.
863          */
864         if (node->hj_HashTable != NULL)
865         {
866                 if (node->hj_HashTable->nbatch == 1 &&
867                         ((PlanState *) node)->righttree->chgParam == NULL)
868                 {
869                         /*
870                          * okay to reuse the hash table; needn't rescan inner, either.
871                          *
872                          * What we do need to do is reset our state about the emptiness of
873                          * the outer relation, so that the new scan of the outer will
874                          * update it correctly if it turns out to be empty this time.
875                          * (There's no harm in clearing it now because ExecHashJoin won't
876                          * need the info.  In the other cases, where the hash table
877                          * doesn't exist or we are destroying it, we leave this state
878                          * alone because ExecHashJoin will need it the first time
879                          * through.)
880                          */
881                         node->hj_OuterNotEmpty = false;
882                 }
883                 else
884                 {
885                         /* must destroy and rebuild hash table */
886                         ExecHashTableDestroy(node->hj_HashTable);
887                         node->hj_HashTable = NULL;
888
889                         /*
890                          * if chgParam of subnode is not null then plan will be re-scanned
891                          * by first ExecProcNode.
892                          */
893                         if (((PlanState *) node)->righttree->chgParam == NULL)
894                                 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
895                 }
896         }
897
898         /* Always reset intra-tuple state */
899         node->hj_CurHashValue = 0;
900         node->hj_CurBucketNo = 0;
901         node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
902         node->hj_CurTuple = NULL;
903
904         node->js.ps.ps_TupFromTlist = false;
905         node->hj_NeedNewOuter = true;
906         node->hj_MatchedOuter = false;
907         node->hj_FirstOuterTupleSlot = NULL;
908
909         /*
910          * if chgParam of subnode is not null then plan will be re-scanned by
911          * first ExecProcNode.
912          */
913         if (((PlanState *) node)->lefttree->chgParam == NULL)
914                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
915 }