]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c
bfe5be7c33c83ec185faac703070c4fec9f95280
[postgresql] / src / backend / executor / nodeHashjoin.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *        Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.93 2008/01/01 19:45:49 momjian Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15
16 #include "postgres.h"
17
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "utils/memutils.h"
23
24
25 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
26                                                   HashJoinState *hjstate,
27                                                   uint32 *hashvalue);
28 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
29                                                   BufFile *file,
30                                                   uint32 *hashvalue,
31                                                   TupleTableSlot *tupleSlot);
32 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
33
34
35 /* ----------------------------------------------------------------
36  *              ExecHashJoin
37  *
38  *              This function implements the Hybrid Hashjoin algorithm.
39  *
40  *              Note: the relation we build hash table on is the "inner"
41  *                        the other one is "outer".
42  * ----------------------------------------------------------------
43  */
44 TupleTableSlot *                                /* return: a tuple or NULL */
45 ExecHashJoin(HashJoinState *node)
46 {
47         EState     *estate;
48         PlanState  *outerNode;
49         HashState  *hashNode;
50         List       *joinqual;
51         List       *otherqual;
52         TupleTableSlot *inntuple;
53         ExprContext *econtext;
54         ExprDoneCond isDone;
55         HashJoinTable hashtable;
56         HashJoinTuple curtuple;
57         TupleTableSlot *outerTupleSlot;
58         uint32          hashvalue;
59         int                     batchno;
60
61         /*
62          * get information from HashJoin node
63          */
64         estate = node->js.ps.state;
65         joinqual = node->js.joinqual;
66         otherqual = node->js.ps.qual;
67         hashNode = (HashState *) innerPlanState(node);
68         outerNode = outerPlanState(node);
69
70         /*
71          * get information from HashJoin state
72          */
73         hashtable = node->hj_HashTable;
74         econtext = node->js.ps.ps_ExprContext;
75
76         /*
77          * Check to see if we're still projecting out tuples from a previous join
78          * tuple (because there is a function-returning-set in the projection
79          * expressions).  If so, try to project another one.
80          */
81         if (node->js.ps.ps_TupFromTlist)
82         {
83                 TupleTableSlot *result;
84
85                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
86                 if (isDone == ExprMultipleResult)
87                         return result;
88                 /* Done with that source tuple... */
89                 node->js.ps.ps_TupFromTlist = false;
90         }
91
92         /*
93          * If we're doing an IN join, we want to return at most one row per outer
94          * tuple; so we can stop scanning the inner scan if we matched on the
95          * previous try.
96          */
97         if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
98                 node->hj_NeedNewOuter = true;
99
100         /*
101          * Reset per-tuple memory context to free any expression evaluation
102          * storage allocated in the previous tuple cycle.  Note this can't happen
103          * until we're done projecting out tuples from a join tuple.
104          */
105         ResetExprContext(econtext);
106
107         /*
108          * if this is the first call, build the hash table for inner relation
109          */
110         if (hashtable == NULL)
111         {
112                 /*
113                  * If the outer relation is completely empty, we can quit without
114                  * building the hash table.  However, for an inner join it is only a
115                  * win to check this when the outer relation's startup cost is less
116                  * than the projected cost of building the hash table.  Otherwise it's
117                  * best to build the hash table first and see if the inner relation is
118                  * empty.  (When it's an outer join, we should always make this check,
119                  * since we aren't going to be able to skip the join on the strength
120                  * of an empty inner relation anyway.)
121                  *
122                  * If we are rescanning the join, we make use of information gained on
123                  * the previous scan: don't bother to try the prefetch if the previous
124                  * scan found the outer relation nonempty.      This is not 100% reliable
125                  * since with new parameters the outer relation might yield different
126                  * results, but it's a good heuristic.
127                  *
128                  * The only way to make the check is to try to fetch a tuple from the
129                  * outer plan node.  If we succeed, we have to stash it away for later
130                  * consumption by ExecHashJoinOuterGetTuple.
131                  */
132                 if (node->js.jointype == JOIN_LEFT ||
133                         (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
134                          !node->hj_OuterNotEmpty))
135                 {
136                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
137                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
138                         {
139                                 node->hj_OuterNotEmpty = false;
140                                 return NULL;
141                         }
142                         else
143                                 node->hj_OuterNotEmpty = true;
144                 }
145                 else
146                         node->hj_FirstOuterTupleSlot = NULL;
147
148                 /*
149                  * create the hash table
150                  */
151                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
152                                                                                 node->hj_HashOperators);
153                 node->hj_HashTable = hashtable;
154
155                 /*
156                  * execute the Hash node, to build the hash table
157                  */
158                 hashNode->hashtable = hashtable;
159                 (void) MultiExecProcNode((PlanState *) hashNode);
160
161                 /*
162                  * If the inner relation is completely empty, and we're not doing an
163                  * outer join, we can quit without scanning the outer relation.
164                  */
165                 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
166                         return NULL;
167
168                 /*
169                  * need to remember whether nbatch has increased since we began
170                  * scanning the outer relation
171                  */
172                 hashtable->nbatch_outstart = hashtable->nbatch;
173
174                 /*
175                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
176                  * above, because ExecHashJoinOuterGetTuple will immediately set it
177                  * again.)
178                  */
179                 node->hj_OuterNotEmpty = false;
180         }
181
182         /*
183          * run the hash join process
184          */
185         for (;;)
186         {
187                 /*
188                  * If we don't have an outer tuple, get the next one
189                  */
190                 if (node->hj_NeedNewOuter)
191                 {
192                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
193                                                                                                            node,
194                                                                                                            &hashvalue);
195                         if (TupIsNull(outerTupleSlot))
196                         {
197                                 /* end of join */
198                                 return NULL;
199                         }
200
201                         node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
202                         econtext->ecxt_outertuple = outerTupleSlot;
203                         node->hj_NeedNewOuter = false;
204                         node->hj_MatchedOuter = false;
205
206                         /*
207                          * now we have an outer tuple, find the corresponding bucket for
208                          * this tuple from the hash table
209                          */
210                         node->hj_CurHashValue = hashvalue;
211                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
212                                                                           &node->hj_CurBucketNo, &batchno);
213                         node->hj_CurTuple = NULL;
214
215                         /*
216                          * Now we've got an outer tuple and the corresponding hash bucket,
217                          * but this tuple may not belong to the current batch.
218                          */
219                         if (batchno != hashtable->curbatch)
220                         {
221                                 /*
222                                  * Need to postpone this outer tuple to a later batch. Save it
223                                  * in the corresponding outer-batch file.
224                                  */
225                                 Assert(batchno > hashtable->curbatch);
226                                 ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
227                                                                           hashvalue,
228                                                                           &hashtable->outerBatchFile[batchno]);
229                                 node->hj_NeedNewOuter = true;
230                                 continue;               /* loop around for a new outer tuple */
231                         }
232                 }
233
234                 /*
235                  * OK, scan the selected hash bucket for matches
236                  */
237                 for (;;)
238                 {
239                         curtuple = ExecScanHashBucket(node, econtext);
240                         if (curtuple == NULL)
241                                 break;                  /* out of matches */
242
243                         /*
244                          * we've got a match, but still need to test non-hashed quals
245                          */
246                         inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
247                                                                                          node->hj_HashTupleSlot,
248                                                                                          false);        /* don't pfree */
249                         econtext->ecxt_innertuple = inntuple;
250
251                         /* reset temp memory each time to avoid leaks from qual expr */
252                         ResetExprContext(econtext);
253
254                         /*
255                          * if we pass the qual, then save state for next call and have
256                          * ExecProject form the projection, store it in the tuple table,
257                          * and return the slot.
258                          *
259                          * Only the joinquals determine MatchedOuter status, but all quals
260                          * must pass to actually return the tuple.
261                          */
262                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
263                         {
264                                 node->hj_MatchedOuter = true;
265
266                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
267                                 {
268                                         TupleTableSlot *result;
269
270                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
271
272                                         if (isDone != ExprEndResult)
273                                         {
274                                                 node->js.ps.ps_TupFromTlist =
275                                                         (isDone == ExprMultipleResult);
276                                                 return result;
277                                         }
278                                 }
279
280                                 /*
281                                  * If we didn't return a tuple, may need to set NeedNewOuter
282                                  */
283                                 if (node->js.jointype == JOIN_IN)
284                                 {
285                                         node->hj_NeedNewOuter = true;
286                                         break;          /* out of loop over hash bucket */
287                                 }
288                         }
289                 }
290
291                 /*
292                  * Now the current outer tuple has run out of matches, so check
293                  * whether to emit a dummy outer-join tuple. If not, loop around to
294                  * get a new outer tuple.
295                  */
296                 node->hj_NeedNewOuter = true;
297
298                 if (!node->hj_MatchedOuter &&
299                         node->js.jointype == JOIN_LEFT)
300                 {
301                         /*
302                          * We are doing an outer join and there were no join matches for
303                          * this outer tuple.  Generate a fake join tuple with nulls for
304                          * the inner tuple, and return it if it passes the non-join quals.
305                          */
306                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
307
308                         if (ExecQual(otherqual, econtext, false))
309                         {
310                                 /*
311                                  * qualification was satisfied so we project and return the
312                                  * slot containing the result tuple using ExecProject().
313                                  */
314                                 TupleTableSlot *result;
315
316                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
317
318                                 if (isDone != ExprEndResult)
319                                 {
320                                         node->js.ps.ps_TupFromTlist =
321                                                 (isDone == ExprMultipleResult);
322                                         return result;
323                                 }
324                         }
325                 }
326         }
327 }
328
329 /* ----------------------------------------------------------------
330  *              ExecInitHashJoin
331  *
332  *              Init routine for HashJoin node.
333  * ----------------------------------------------------------------
334  */
335 HashJoinState *
336 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
337 {
338         HashJoinState *hjstate;
339         Plan       *outerNode;
340         Hash       *hashNode;
341         List       *lclauses;
342         List       *rclauses;
343         List       *hoperators;
344         ListCell   *l;
345
346         /* check for unsupported flags */
347         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
348
349         /*
350          * create state structure
351          */
352         hjstate = makeNode(HashJoinState);
353         hjstate->js.ps.plan = (Plan *) node;
354         hjstate->js.ps.state = estate;
355
356         /*
357          * Miscellaneous initialization
358          *
359          * create expression context for node
360          */
361         ExecAssignExprContext(estate, &hjstate->js.ps);
362
363         /*
364          * initialize child expressions
365          */
366         hjstate->js.ps.targetlist = (List *)
367                 ExecInitExpr((Expr *) node->join.plan.targetlist,
368                                          (PlanState *) hjstate);
369         hjstate->js.ps.qual = (List *)
370                 ExecInitExpr((Expr *) node->join.plan.qual,
371                                          (PlanState *) hjstate);
372         hjstate->js.jointype = node->join.jointype;
373         hjstate->js.joinqual = (List *)
374                 ExecInitExpr((Expr *) node->join.joinqual,
375                                          (PlanState *) hjstate);
376         hjstate->hashclauses = (List *)
377                 ExecInitExpr((Expr *) node->hashclauses,
378                                          (PlanState *) hjstate);
379
380         /*
381          * initialize child nodes
382          *
383          * Note: we could suppress the REWIND flag for the inner input, which
384          * would amount to betting that the hash will be a single batch.  Not
385          * clear if this would be a win or not.
386          */
387         outerNode = outerPlan(node);
388         hashNode = (Hash *) innerPlan(node);
389
390         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
391         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
392
393 #define HASHJOIN_NSLOTS 3
394
395         /*
396          * tuple table initialization
397          */
398         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
399         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
400
401         switch (node->join.jointype)
402         {
403                 case JOIN_INNER:
404                 case JOIN_IN:
405                         break;
406                 case JOIN_LEFT:
407                         hjstate->hj_NullInnerTupleSlot =
408                                 ExecInitNullTupleSlot(estate,
409                                                                  ExecGetResultType(innerPlanState(hjstate)));
410                         break;
411                 default:
412                         elog(ERROR, "unrecognized join type: %d",
413                                  (int) node->join.jointype);
414         }
415
416         /*
417          * now for some voodoo.  our temporary tuple slot is actually the result
418          * tuple slot of the Hash node (which is our inner plan).  we do this
419          * because Hash nodes don't return tuples via ExecProcNode() -- instead
420          * the hash join node uses ExecScanHashBucket() to get at the contents of
421          * the hash table.      -cim 6/9/91
422          */
423         {
424                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
425                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
426
427                 hjstate->hj_HashTupleSlot = slot;
428         }
429
430         /*
431          * initialize tuple type and projection info
432          */
433         ExecAssignResultTypeFromTL(&hjstate->js.ps);
434         ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
435
436         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
437                                                   ExecGetResultType(outerPlanState(hjstate)));
438
439         /*
440          * initialize hash-specific info
441          */
442         hjstate->hj_HashTable = NULL;
443         hjstate->hj_FirstOuterTupleSlot = NULL;
444
445         hjstate->hj_CurHashValue = 0;
446         hjstate->hj_CurBucketNo = 0;
447         hjstate->hj_CurTuple = NULL;
448
449         /*
450          * Deconstruct the hash clauses into outer and inner argument values, so
451          * that we can evaluate those subexpressions separately.  Also make a list
452          * of the hash operator OIDs, in preparation for looking up the hash
453          * functions to use.
454          */
455         lclauses = NIL;
456         rclauses = NIL;
457         hoperators = NIL;
458         foreach(l, hjstate->hashclauses)
459         {
460                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
461                 OpExpr     *hclause;
462
463                 Assert(IsA(fstate, FuncExprState));
464                 hclause = (OpExpr *) fstate->xprstate.expr;
465                 Assert(IsA(hclause, OpExpr));
466                 lclauses = lappend(lclauses, linitial(fstate->args));
467                 rclauses = lappend(rclauses, lsecond(fstate->args));
468                 hoperators = lappend_oid(hoperators, hclause->opno);
469         }
470         hjstate->hj_OuterHashKeys = lclauses;
471         hjstate->hj_InnerHashKeys = rclauses;
472         hjstate->hj_HashOperators = hoperators;
473         /* child Hash node needs to evaluate inner hash keys, too */
474         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
475
476         hjstate->js.ps.ps_OuterTupleSlot = NULL;
477         hjstate->js.ps.ps_TupFromTlist = false;
478         hjstate->hj_NeedNewOuter = true;
479         hjstate->hj_MatchedOuter = false;
480         hjstate->hj_OuterNotEmpty = false;
481
482         return hjstate;
483 }
484
485 int
486 ExecCountSlotsHashJoin(HashJoin *node)
487 {
488         return ExecCountSlotsNode(outerPlan(node)) +
489                 ExecCountSlotsNode(innerPlan(node)) +
490                 HASHJOIN_NSLOTS;
491 }
492
493 /* ----------------------------------------------------------------
494  *              ExecEndHashJoin
495  *
496  *              clean up routine for HashJoin node
497  * ----------------------------------------------------------------
498  */
499 void
500 ExecEndHashJoin(HashJoinState *node)
501 {
502         /*
503          * Free hash table
504          */
505         if (node->hj_HashTable)
506         {
507                 ExecHashTableDestroy(node->hj_HashTable);
508                 node->hj_HashTable = NULL;
509         }
510
511         /*
512          * Free the exprcontext
513          */
514         ExecFreeExprContext(&node->js.ps);
515
516         /*
517          * clean out the tuple table
518          */
519         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
520         ExecClearTuple(node->hj_OuterTupleSlot);
521         ExecClearTuple(node->hj_HashTupleSlot);
522
523         /*
524          * clean up subtrees
525          */
526         ExecEndNode(outerPlanState(node));
527         ExecEndNode(innerPlanState(node));
528 }
529
530 /*
531  * ExecHashJoinOuterGetTuple
532  *
533  *              get the next outer tuple for hashjoin: either by
534  *              executing a plan node in the first pass, or from
535  *              the temp files for the hashjoin batches.
536  *
537  * Returns a null slot if no more outer tuples.  On success, the tuple's
538  * hash value is stored at *hashvalue --- this is either originally computed,
539  * or re-read from the temp file.
540  */
541 static TupleTableSlot *
542 ExecHashJoinOuterGetTuple(PlanState *outerNode,
543                                                   HashJoinState *hjstate,
544                                                   uint32 *hashvalue)
545 {
546         HashJoinTable hashtable = hjstate->hj_HashTable;
547         int                     curbatch = hashtable->curbatch;
548         TupleTableSlot *slot;
549
550         if (curbatch == 0)                      /* if it is the first pass */
551         {
552                 /*
553                  * Check to see if first outer tuple was already fetched by
554                  * ExecHashJoin() and not used yet.
555                  */
556                 slot = hjstate->hj_FirstOuterTupleSlot;
557                 if (!TupIsNull(slot))
558                         hjstate->hj_FirstOuterTupleSlot = NULL;
559                 else
560                         slot = ExecProcNode(outerNode);
561
562                 while (!TupIsNull(slot))
563                 {
564                         /*
565                          * We have to compute the tuple's hash value.
566                          */
567                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
568
569                         econtext->ecxt_outertuple = slot;
570                         if (ExecHashGetHashValue(hashtable, econtext,
571                                                                          hjstate->hj_OuterHashKeys,
572                                                                          true,          /* outer tuple */
573                                                                          (hjstate->js.jointype == JOIN_LEFT),
574                                                                          hashvalue))
575                         {
576                                 /* remember outer relation is not empty for possible rescan */
577                                 hjstate->hj_OuterNotEmpty = true;
578
579                                 return slot;
580                         }
581
582                         /*
583                          * That tuple couldn't match because of a NULL, so discard it and
584                          * continue with the next one.
585                          */
586                         slot = ExecProcNode(outerNode);
587                 }
588
589                 /*
590                  * We have just reached the end of the first pass. Try to switch to a
591                  * saved batch.
592                  */
593                 curbatch = ExecHashJoinNewBatch(hjstate);
594         }
595
596         /*
597          * Try to read from a temp file. Loop allows us to advance to new batches
598          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
599          * don't try to optimize this loop.
600          */
601         while (curbatch < hashtable->nbatch)
602         {
603                 slot = ExecHashJoinGetSavedTuple(hjstate,
604                                                                                  hashtable->outerBatchFile[curbatch],
605                                                                                  hashvalue,
606                                                                                  hjstate->hj_OuterTupleSlot);
607                 if (!TupIsNull(slot))
608                         return slot;
609                 curbatch = ExecHashJoinNewBatch(hjstate);
610         }
611
612         /* Out of batches... */
613         return NULL;
614 }
615
616 /*
617  * ExecHashJoinNewBatch
618  *              switch to a new hashjoin batch
619  *
620  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
621  * We will never return a batch number that has an empty outer batch file.
622  */
623 static int
624 ExecHashJoinNewBatch(HashJoinState *hjstate)
625 {
626         HashJoinTable hashtable = hjstate->hj_HashTable;
627         int                     nbatch;
628         int                     curbatch;
629         BufFile    *innerFile;
630         TupleTableSlot *slot;
631         uint32          hashvalue;
632
633 start_over:
634         nbatch = hashtable->nbatch;
635         curbatch = hashtable->curbatch;
636
637         if (curbatch > 0)
638         {
639                 /*
640                  * We no longer need the previous outer batch file; close it right
641                  * away to free disk space.
642                  */
643                 if (hashtable->outerBatchFile[curbatch])
644                         BufFileClose(hashtable->outerBatchFile[curbatch]);
645                 hashtable->outerBatchFile[curbatch] = NULL;
646         }
647
648         /*
649          * We can always skip over any batches that are completely empty on both
650          * sides.  We can sometimes skip over batches that are empty on only one
651          * side, but there are exceptions:
652          *
653          * 1. In a LEFT JOIN, we have to process outer batches even if the inner
654          * batch is empty.
655          *
656          * 2. If we have increased nbatch since the initial estimate, we have to
657          * scan inner batches since they might contain tuples that need to be
658          * reassigned to later inner batches.
659          *
660          * 3. Similarly, if we have increased nbatch since starting the outer
661          * scan, we have to rescan outer batches in case they contain tuples that
662          * need to be reassigned.
663          */
664         curbatch++;
665         while (curbatch < nbatch &&
666                    (hashtable->outerBatchFile[curbatch] == NULL ||
667                         hashtable->innerBatchFile[curbatch] == NULL))
668         {
669                 if (hashtable->outerBatchFile[curbatch] &&
670                         hjstate->js.jointype == JOIN_LEFT)
671                         break;                          /* must process due to rule 1 */
672                 if (hashtable->innerBatchFile[curbatch] &&
673                         nbatch != hashtable->nbatch_original)
674                         break;                          /* must process due to rule 2 */
675                 if (hashtable->outerBatchFile[curbatch] &&
676                         nbatch != hashtable->nbatch_outstart)
677                         break;                          /* must process due to rule 3 */
678                 /* We can ignore this batch. */
679                 /* Release associated temp files right away. */
680                 if (hashtable->innerBatchFile[curbatch])
681                         BufFileClose(hashtable->innerBatchFile[curbatch]);
682                 hashtable->innerBatchFile[curbatch] = NULL;
683                 if (hashtable->outerBatchFile[curbatch])
684                         BufFileClose(hashtable->outerBatchFile[curbatch]);
685                 hashtable->outerBatchFile[curbatch] = NULL;
686                 curbatch++;
687         }
688
689         if (curbatch >= nbatch)
690                 return curbatch;                /* no more batches */
691
692         hashtable->curbatch = curbatch;
693
694         /*
695          * Reload the hash table with the new inner batch (which could be empty)
696          */
697         ExecHashTableReset(hashtable);
698
699         innerFile = hashtable->innerBatchFile[curbatch];
700
701         if (innerFile != NULL)
702         {
703                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
704                         ereport(ERROR,
705                                         (errcode_for_file_access(),
706                                    errmsg("could not rewind hash-join temporary file: %m")));
707
708                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
709                                                                                                  innerFile,
710                                                                                                  &hashvalue,
711                                                                                                  hjstate->hj_HashTupleSlot)))
712                 {
713                         /*
714                          * NOTE: some tuples may be sent to future batches.  Also, it is
715                          * possible for hashtable->nbatch to be increased here!
716                          */
717                         ExecHashTableInsert(hashtable, slot, hashvalue);
718                 }
719
720                 /*
721                  * after we build the hash table, the inner batch file is no longer
722                  * needed
723                  */
724                 BufFileClose(innerFile);
725                 hashtable->innerBatchFile[curbatch] = NULL;
726         }
727
728         /*
729          * If there's no outer batch file, advance to next batch.
730          */
731         if (hashtable->outerBatchFile[curbatch] == NULL)
732                 goto start_over;
733
734         /*
735          * Rewind outer batch file, so that we can start reading it.
736          */
737         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
738                 ereport(ERROR,
739                                 (errcode_for_file_access(),
740                                  errmsg("could not rewind hash-join temporary file: %m")));
741
742         return curbatch;
743 }
744
745 /*
746  * ExecHashJoinSaveTuple
747  *              save a tuple to a batch file.
748  *
749  * The data recorded in the file for each tuple is its hash value,
750  * then the tuple in MinimalTuple format.
751  *
752  * Note: it is important always to call this in the regular executor
753  * context, not in a shorter-lived context; else the temp file buffers
754  * will get messed up.
755  */
756 void
757 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
758                                           BufFile **fileptr)
759 {
760         BufFile    *file = *fileptr;
761         size_t          written;
762
763         if (file == NULL)
764         {
765                 /* First write to this batch file, so open it. */
766                 file = BufFileCreateTemp(false);
767                 *fileptr = file;
768         }
769
770         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
771         if (written != sizeof(uint32))
772                 ereport(ERROR,
773                                 (errcode_for_file_access(),
774                                  errmsg("could not write to hash-join temporary file: %m")));
775
776         written = BufFileWrite(file, (void *) tuple, tuple->t_len);
777         if (written != tuple->t_len)
778                 ereport(ERROR,
779                                 (errcode_for_file_access(),
780                                  errmsg("could not write to hash-join temporary file: %m")));
781 }
782
783 /*
784  * ExecHashJoinGetSavedTuple
785  *              read the next tuple from a batch file.  Return NULL if no more.
786  *
787  * On success, *hashvalue is set to the tuple's hash value, and the tuple
788  * itself is stored in the given slot.
789  */
790 static TupleTableSlot *
791 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
792                                                   BufFile *file,
793                                                   uint32 *hashvalue,
794                                                   TupleTableSlot *tupleSlot)
795 {
796         uint32          header[2];
797         size_t          nread;
798         MinimalTuple tuple;
799
800         /*
801          * Since both the hash value and the MinimalTuple length word are uint32,
802          * we can read them both in one BufFileRead() call without any type
803          * cheating.
804          */
805         nread = BufFileRead(file, (void *) header, sizeof(header));
806         if (nread == 0)                         /* end of file */
807         {
808                 ExecClearTuple(tupleSlot);
809                 return NULL;
810         }
811         if (nread != sizeof(header))
812                 ereport(ERROR,
813                                 (errcode_for_file_access(),
814                                  errmsg("could not read from hash-join temporary file: %m")));
815         *hashvalue = header[0];
816         tuple = (MinimalTuple) palloc(header[1]);
817         tuple->t_len = header[1];
818         nread = BufFileRead(file,
819                                                 (void *) ((char *) tuple + sizeof(uint32)),
820                                                 header[1] - sizeof(uint32));
821         if (nread != header[1] - sizeof(uint32))
822                 ereport(ERROR,
823                                 (errcode_for_file_access(),
824                                  errmsg("could not read from hash-join temporary file: %m")));
825         return ExecStoreMinimalTuple(tuple, tupleSlot, true);
826 }
827
828
829 void
830 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
831 {
832         /*
833          * In a multi-batch join, we currently have to do rescans the hard way,
834          * primarily because batch temp files may have already been released. But
835          * if it's a single-batch join, and there is no parameter change for the
836          * inner subnode, then we can just re-use the existing hash table without
837          * rebuilding it.
838          */
839         if (node->hj_HashTable != NULL)
840         {
841                 if (node->hj_HashTable->nbatch == 1 &&
842                         ((PlanState *) node)->righttree->chgParam == NULL)
843                 {
844                         /*
845                          * okay to reuse the hash table; needn't rescan inner, either.
846                          *
847                          * What we do need to do is reset our state about the emptiness of
848                          * the outer relation, so that the new scan of the outer will
849                          * update it correctly if it turns out to be empty this time.
850                          * (There's no harm in clearing it now because ExecHashJoin won't
851                          * need the info.  In the other cases, where the hash table
852                          * doesn't exist or we are destroying it, we leave this state
853                          * alone because ExecHashJoin will need it the first time
854                          * through.)
855                          */
856                         node->hj_OuterNotEmpty = false;
857                 }
858                 else
859                 {
860                         /* must destroy and rebuild hash table */
861                         ExecHashTableDestroy(node->hj_HashTable);
862                         node->hj_HashTable = NULL;
863
864                         /*
865                          * if chgParam of subnode is not null then plan will be re-scanned
866                          * by first ExecProcNode.
867                          */
868                         if (((PlanState *) node)->righttree->chgParam == NULL)
869                                 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
870                 }
871         }
872
873         /* Always reset intra-tuple state */
874         node->hj_CurHashValue = 0;
875         node->hj_CurBucketNo = 0;
876         node->hj_CurTuple = NULL;
877
878         node->js.ps.ps_OuterTupleSlot = NULL;
879         node->js.ps.ps_TupFromTlist = false;
880         node->hj_NeedNewOuter = true;
881         node->hj_MatchedOuter = false;
882         node->hj_FirstOuterTupleSlot = NULL;
883
884         /*
885          * if chgParam of subnode is not null then plan will be re-scanned by
886          * first ExecProcNode.
887          */
888         if (((PlanState *) node)->lefttree->chgParam == NULL)
889                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
890 }