]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c
Update copyright for 2006. Update scripts.
[postgresql] / src / backend / executor / nodeHashjoin.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *        Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.81 2006/03/05 15:58:26 momjian Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15
16 #include "postgres.h"
17
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "optimizer/clauses.h"
23 #include "utils/memutils.h"
24
25
26 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
27                                                   HashJoinState *hjstate,
28                                                   uint32 *hashvalue);
29 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
30                                                   BufFile *file,
31                                                   uint32 *hashvalue,
32                                                   TupleTableSlot *tupleSlot);
33 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
34
35
36 /* ----------------------------------------------------------------
37  *              ExecHashJoin
38  *
39  *              This function implements the Hybrid Hashjoin algorithm.
40  *
41  *              Note: the relation we build hash table on is the "inner"
42  *                        the other one is "outer".
43  * ----------------------------------------------------------------
44  */
45 TupleTableSlot *                                /* return: a tuple or NULL */
46 ExecHashJoin(HashJoinState *node)
47 {
48         EState     *estate;
49         PlanState  *outerNode;
50         HashState  *hashNode;
51         List       *joinqual;
52         List       *otherqual;
53         TupleTableSlot *inntuple;
54         ExprContext *econtext;
55         ExprDoneCond isDone;
56         HashJoinTable hashtable;
57         HeapTuple       curtuple;
58         TupleTableSlot *outerTupleSlot;
59         uint32          hashvalue;
60         int                     batchno;
61
62         /*
63          * get information from HashJoin node
64          */
65         estate = node->js.ps.state;
66         joinqual = node->js.joinqual;
67         otherqual = node->js.ps.qual;
68         hashNode = (HashState *) innerPlanState(node);
69         outerNode = outerPlanState(node);
70
71         /*
72          * get information from HashJoin state
73          */
74         hashtable = node->hj_HashTable;
75         econtext = node->js.ps.ps_ExprContext;
76
77         /*
78          * Check to see if we're still projecting out tuples from a previous join
79          * tuple (because there is a function-returning-set in the projection
80          * expressions).  If so, try to project another one.
81          */
82         if (node->js.ps.ps_TupFromTlist)
83         {
84                 TupleTableSlot *result;
85
86                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
87                 if (isDone == ExprMultipleResult)
88                         return result;
89                 /* Done with that source tuple... */
90                 node->js.ps.ps_TupFromTlist = false;
91         }
92
93         /*
94          * If we're doing an IN join, we want to return at most one row per outer
95          * tuple; so we can stop scanning the inner scan if we matched on the
96          * previous try.
97          */
98         if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
99                 node->hj_NeedNewOuter = true;
100
101         /*
102          * Reset per-tuple memory context to free any expression evaluation
103          * storage allocated in the previous tuple cycle.  Note this can't happen
104          * until we're done projecting out tuples from a join tuple.
105          */
106         ResetExprContext(econtext);
107
108         /*
109          * if this is the first call, build the hash table for inner relation
110          */
111         if (hashtable == NULL)
112         {
113                 /*
114                  * If the outer relation is completely empty, we can quit without
115                  * building the hash table.  However, for an inner join it is only a
116                  * win to check this when the outer relation's startup cost is less
117                  * than the projected cost of building the hash table.  Otherwise it's
118                  * best to build the hash table first and see if the inner relation is
119                  * empty.  (When it's an outer join, we should always make this check,
120                  * since we aren't going to be able to skip the join on the strength
121                  * of an empty inner relation anyway.)
122                  *
123                  * If we are rescanning the join, we make use of information gained
124                  * on the previous scan: don't bother to try the prefetch if the
125                  * previous scan found the outer relation nonempty.  This is not
126                  * 100% reliable since with new parameters the outer relation might
127                  * yield different results, but it's a good heuristic.
128                  *
129                  * The only way to make the check is to try to fetch a tuple from the
130                  * outer plan node.  If we succeed, we have to stash it away for later
131                  * consumption by ExecHashJoinOuterGetTuple.
132                  */
133                 if (node->js.jointype == JOIN_LEFT ||
134                         (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
135                          !node->hj_OuterNotEmpty))
136                 {
137                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
138                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
139                         {
140                                 node->hj_OuterNotEmpty = false;
141                                 return NULL;
142                         }
143                         else
144                                 node->hj_OuterNotEmpty = true;
145                 }
146                 else
147                         node->hj_FirstOuterTupleSlot = NULL;
148
149                 /*
150                  * create the hash table
151                  */
152                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
153                                                                                 node->hj_HashOperators);
154                 node->hj_HashTable = hashtable;
155
156                 /*
157                  * execute the Hash node, to build the hash table
158                  */
159                 hashNode->hashtable = hashtable;
160                 (void) MultiExecProcNode((PlanState *) hashNode);
161
162                 /*
163                  * If the inner relation is completely empty, and we're not doing an
164                  * outer join, we can quit without scanning the outer relation.
165                  */
166                 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
167                         return NULL;
168
169                 /*
170                  * need to remember whether nbatch has increased since we began
171                  * scanning the outer relation
172                  */
173                 hashtable->nbatch_outstart = hashtable->nbatch;
174
175                 /*
176                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
177                  * above, because ExecHashJoinOuterGetTuple will immediately
178                  * set it again.)
179                  */
180                 node->hj_OuterNotEmpty = false;
181         }
182
183         /*
184          * run the hash join process
185          */
186         for (;;)
187         {
188                 /*
189                  * If we don't have an outer tuple, get the next one
190                  */
191                 if (node->hj_NeedNewOuter)
192                 {
193                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
194                                                                                                            node,
195                                                                                                            &hashvalue);
196                         if (TupIsNull(outerTupleSlot))
197                         {
198                                 /* end of join */
199                                 return NULL;
200                         }
201
202                         node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
203                         econtext->ecxt_outertuple = outerTupleSlot;
204                         node->hj_NeedNewOuter = false;
205                         node->hj_MatchedOuter = false;
206
207                         /*
208                          * now we have an outer tuple, find the corresponding bucket for
209                          * this tuple from the hash table
210                          */
211                         node->hj_CurHashValue = hashvalue;
212                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
213                                                                           &node->hj_CurBucketNo, &batchno);
214                         node->hj_CurTuple = NULL;
215
216                         /*
217                          * Now we've got an outer tuple and the corresponding hash bucket,
218                          * but this tuple may not belong to the current batch.
219                          */
220                         if (batchno != hashtable->curbatch)
221                         {
222                                 /*
223                                  * Need to postpone this outer tuple to a later batch. Save it
224                                  * in the corresponding outer-batch file.
225                                  */
226                                 Assert(batchno > hashtable->curbatch);
227                                 ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
228                                                                           hashvalue,
229                                                                           &hashtable->outerBatchFile[batchno]);
230                                 node->hj_NeedNewOuter = true;
231                                 continue;               /* loop around for a new outer tuple */
232                         }
233                 }
234
235                 /*
236                  * OK, scan the selected hash bucket for matches
237                  */
238                 for (;;)
239                 {
240                         curtuple = ExecScanHashBucket(node, econtext);
241                         if (curtuple == NULL)
242                                 break;                  /* out of matches */
243
244                         /*
245                          * we've got a match, but still need to test non-hashed quals
246                          */
247                         inntuple = ExecStoreTuple(curtuple,
248                                                                           node->hj_HashTupleSlot,
249                                                                           InvalidBuffer,
250                                                                           false);       /* don't pfree this tuple */
251                         econtext->ecxt_innertuple = inntuple;
252
253                         /* reset temp memory each time to avoid leaks from qual expr */
254                         ResetExprContext(econtext);
255
256                         /*
257                          * if we pass the qual, then save state for next call and have
258                          * ExecProject form the projection, store it in the tuple table,
259                          * and return the slot.
260                          *
261                          * Only the joinquals determine MatchedOuter status, but all quals
262                          * must pass to actually return the tuple.
263                          */
264                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
265                         {
266                                 node->hj_MatchedOuter = true;
267
268                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
269                                 {
270                                         TupleTableSlot *result;
271
272                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
273
274                                         if (isDone != ExprEndResult)
275                                         {
276                                                 node->js.ps.ps_TupFromTlist =
277                                                         (isDone == ExprMultipleResult);
278                                                 return result;
279                                         }
280                                 }
281
282                                 /*
283                                  * If we didn't return a tuple, may need to set NeedNewOuter
284                                  */
285                                 if (node->js.jointype == JOIN_IN)
286                                 {
287                                         node->hj_NeedNewOuter = true;
288                                         break;          /* out of loop over hash bucket */
289                                 }
290                         }
291                 }
292
293                 /*
294                  * Now the current outer tuple has run out of matches, so check
295                  * whether to emit a dummy outer-join tuple. If not, loop around to
296                  * get a new outer tuple.
297                  */
298                 node->hj_NeedNewOuter = true;
299
300                 if (!node->hj_MatchedOuter &&
301                         node->js.jointype == JOIN_LEFT)
302                 {
303                         /*
304                          * We are doing an outer join and there were no join matches for
305                          * this outer tuple.  Generate a fake join tuple with nulls for
306                          * the inner tuple, and return it if it passes the non-join quals.
307                          */
308                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
309
310                         if (ExecQual(otherqual, econtext, false))
311                         {
312                                 /*
313                                  * qualification was satisfied so we project and return the
314                                  * slot containing the result tuple using ExecProject().
315                                  */
316                                 TupleTableSlot *result;
317
318                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
319
320                                 if (isDone != ExprEndResult)
321                                 {
322                                         node->js.ps.ps_TupFromTlist =
323                                                 (isDone == ExprMultipleResult);
324                                         return result;
325                                 }
326                         }
327                 }
328         }
329 }
330
331 /* ----------------------------------------------------------------
332  *              ExecInitHashJoin
333  *
334  *              Init routine for HashJoin node.
335  * ----------------------------------------------------------------
336  */
337 HashJoinState *
338 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
339 {
340         HashJoinState *hjstate;
341         Plan       *outerNode;
342         Hash       *hashNode;
343         List       *lclauses;
344         List       *rclauses;
345         List       *hoperators;
346         ListCell   *l;
347
348         /* check for unsupported flags */
349         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
350
351         /*
352          * create state structure
353          */
354         hjstate = makeNode(HashJoinState);
355         hjstate->js.ps.plan = (Plan *) node;
356         hjstate->js.ps.state = estate;
357
358         /*
359          * Miscellaneous initialization
360          *
361          * create expression context for node
362          */
363         ExecAssignExprContext(estate, &hjstate->js.ps);
364
365         /*
366          * initialize child expressions
367          */
368         hjstate->js.ps.targetlist = (List *)
369                 ExecInitExpr((Expr *) node->join.plan.targetlist,
370                                          (PlanState *) hjstate);
371         hjstate->js.ps.qual = (List *)
372                 ExecInitExpr((Expr *) node->join.plan.qual,
373                                          (PlanState *) hjstate);
374         hjstate->js.jointype = node->join.jointype;
375         hjstate->js.joinqual = (List *)
376                 ExecInitExpr((Expr *) node->join.joinqual,
377                                          (PlanState *) hjstate);
378         hjstate->hashclauses = (List *)
379                 ExecInitExpr((Expr *) node->hashclauses,
380                                          (PlanState *) hjstate);
381
382         /*
383          * initialize child nodes
384          *
385          * Note: we could suppress the REWIND flag for the inner input, which
386          * would amount to betting that the hash will be a single batch.  Not
387          * clear if this would be a win or not.
388          */
389         outerNode = outerPlan(node);
390         hashNode = (Hash *) innerPlan(node);
391
392         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
393         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
394
395 #define HASHJOIN_NSLOTS 3
396
397         /*
398          * tuple table initialization
399          */
400         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
401         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
402
403         switch (node->join.jointype)
404         {
405                 case JOIN_INNER:
406                 case JOIN_IN:
407                         break;
408                 case JOIN_LEFT:
409                         hjstate->hj_NullInnerTupleSlot =
410                                 ExecInitNullTupleSlot(estate,
411                                                                  ExecGetResultType(innerPlanState(hjstate)));
412                         break;
413                 default:
414                         elog(ERROR, "unrecognized join type: %d",
415                                  (int) node->join.jointype);
416         }
417
418         /*
419          * now for some voodoo.  our temporary tuple slot is actually the result
420          * tuple slot of the Hash node (which is our inner plan).  we do this
421          * because Hash nodes don't return tuples via ExecProcNode() -- instead
422          * the hash join node uses ExecScanHashBucket() to get at the contents of
423          * the hash table.      -cim 6/9/91
424          */
425         {
426                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
427                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
428
429                 hjstate->hj_HashTupleSlot = slot;
430         }
431
432         /*
433          * initialize tuple type and projection info
434          */
435         ExecAssignResultTypeFromTL(&hjstate->js.ps);
436         ExecAssignProjectionInfo(&hjstate->js.ps);
437
438         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
439                                                   ExecGetResultType(outerPlanState(hjstate)),
440                                                   false);
441
442         /*
443          * initialize hash-specific info
444          */
445         hjstate->hj_HashTable = NULL;
446         hjstate->hj_FirstOuterTupleSlot = NULL;
447
448         hjstate->hj_CurHashValue = 0;
449         hjstate->hj_CurBucketNo = 0;
450         hjstate->hj_CurTuple = NULL;
451
452         /*
453          * Deconstruct the hash clauses into outer and inner argument values, so
454          * that we can evaluate those subexpressions separately.  Also make a list
455          * of the hash operator OIDs, in preparation for looking up the hash
456          * functions to use.
457          */
458         lclauses = NIL;
459         rclauses = NIL;
460         hoperators = NIL;
461         foreach(l, hjstate->hashclauses)
462         {
463                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
464                 OpExpr     *hclause;
465
466                 Assert(IsA(fstate, FuncExprState));
467                 hclause = (OpExpr *) fstate->xprstate.expr;
468                 Assert(IsA(hclause, OpExpr));
469                 lclauses = lappend(lclauses, linitial(fstate->args));
470                 rclauses = lappend(rclauses, lsecond(fstate->args));
471                 hoperators = lappend_oid(hoperators, hclause->opno);
472         }
473         hjstate->hj_OuterHashKeys = lclauses;
474         hjstate->hj_InnerHashKeys = rclauses;
475         hjstate->hj_HashOperators = hoperators;
476         /* child Hash node needs to evaluate inner hash keys, too */
477         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
478
479         hjstate->js.ps.ps_OuterTupleSlot = NULL;
480         hjstate->js.ps.ps_TupFromTlist = false;
481         hjstate->hj_NeedNewOuter = true;
482         hjstate->hj_MatchedOuter = false;
483         hjstate->hj_OuterNotEmpty = false;
484
485         return hjstate;
486 }
487
488 int
489 ExecCountSlotsHashJoin(HashJoin *node)
490 {
491         return ExecCountSlotsNode(outerPlan(node)) +
492                 ExecCountSlotsNode(innerPlan(node)) +
493                 HASHJOIN_NSLOTS;
494 }
495
496 /* ----------------------------------------------------------------
497  *              ExecEndHashJoin
498  *
499  *              clean up routine for HashJoin node
500  * ----------------------------------------------------------------
501  */
502 void
503 ExecEndHashJoin(HashJoinState *node)
504 {
505         /*
506          * Free hash table
507          */
508         if (node->hj_HashTable)
509         {
510                 ExecHashTableDestroy(node->hj_HashTable);
511                 node->hj_HashTable = NULL;
512         }
513
514         /*
515          * Free the exprcontext
516          */
517         ExecFreeExprContext(&node->js.ps);
518
519         /*
520          * clean out the tuple table
521          */
522         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
523         ExecClearTuple(node->hj_OuterTupleSlot);
524         ExecClearTuple(node->hj_HashTupleSlot);
525
526         /*
527          * clean up subtrees
528          */
529         ExecEndNode(outerPlanState(node));
530         ExecEndNode(innerPlanState(node));
531 }
532
533 /*
534  * ExecHashJoinOuterGetTuple
535  *
536  *              get the next outer tuple for hashjoin: either by
537  *              executing a plan node in the first pass, or from
538  *              the temp files for the hashjoin batches.
539  *
540  * Returns a null slot if no more outer tuples.  On success, the tuple's
541  * hash value is stored at *hashvalue --- this is either originally computed,
542  * or re-read from the temp file.
543  */
544 static TupleTableSlot *
545 ExecHashJoinOuterGetTuple(PlanState *outerNode,
546                                                   HashJoinState *hjstate,
547                                                   uint32 *hashvalue)
548 {
549         HashJoinTable hashtable = hjstate->hj_HashTable;
550         int                     curbatch = hashtable->curbatch;
551         TupleTableSlot *slot;
552
553         if (curbatch == 0)
554         {                                                       /* if it is the first pass */
555
556                 /*
557                  * Check to see if first outer tuple was already fetched by
558                  * ExecHashJoin() and not used yet.
559                  */
560                 slot = hjstate->hj_FirstOuterTupleSlot;
561                 if (!TupIsNull(slot))
562                         hjstate->hj_FirstOuterTupleSlot = NULL;
563                 else
564                         slot = ExecProcNode(outerNode);
565                 if (!TupIsNull(slot))
566                 {
567                         /*
568                          * We have to compute the tuple's hash value.
569                          */
570                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
571
572                         econtext->ecxt_outertuple = slot;
573                         *hashvalue = ExecHashGetHashValue(hashtable, econtext,
574                                                                                           hjstate->hj_OuterHashKeys);
575
576                         /* remember outer relation is not empty for possible rescan */
577                         hjstate->hj_OuterNotEmpty = true;
578
579                         return slot;
580                 }
581
582                 /*
583                  * We have just reached the end of the first pass. Try to switch to a
584                  * saved batch.
585                  */
586                 curbatch = ExecHashJoinNewBatch(hjstate);
587         }
588
589         /*
590          * Try to read from a temp file. Loop allows us to advance to new batches
591          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
592          * don't try to optimize this loop.
593          */
594         while (curbatch < hashtable->nbatch)
595         {
596                 slot = ExecHashJoinGetSavedTuple(hjstate,
597                                                                                  hashtable->outerBatchFile[curbatch],
598                                                                                  hashvalue,
599                                                                                  hjstate->hj_OuterTupleSlot);
600                 if (!TupIsNull(slot))
601                         return slot;
602                 curbatch = ExecHashJoinNewBatch(hjstate);
603         }
604
605         /* Out of batches... */
606         return NULL;
607 }
608
609 /*
610  * ExecHashJoinNewBatch
611  *              switch to a new hashjoin batch
612  *
613  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
614  * We will never return a batch number that has an empty outer batch file.
615  */
616 static int
617 ExecHashJoinNewBatch(HashJoinState *hjstate)
618 {
619         HashJoinTable hashtable = hjstate->hj_HashTable;
620         int                     nbatch;
621         int                     curbatch;
622         BufFile    *innerFile;
623         TupleTableSlot *slot;
624         uint32          hashvalue;
625
626 start_over:
627         nbatch = hashtable->nbatch;
628         curbatch = hashtable->curbatch;
629
630         if (curbatch > 0)
631         {
632                 /*
633                  * We no longer need the previous outer batch file; close it right
634                  * away to free disk space.
635                  */
636                 if (hashtable->outerBatchFile[curbatch])
637                         BufFileClose(hashtable->outerBatchFile[curbatch]);
638                 hashtable->outerBatchFile[curbatch] = NULL;
639         }
640
641         /*
642          * We can always skip over any batches that are completely empty on both
643          * sides.  We can sometimes skip over batches that are empty on only one
644          * side, but there are exceptions:
645          *
646          * 1. In a LEFT JOIN, we have to process outer batches even if the inner
647          * batch is empty.
648          *
649          * 2. If we have increased nbatch since the initial estimate, we have to
650          * scan inner batches since they might contain tuples that need to be
651          * reassigned to later inner batches.
652          *
653          * 3. Similarly, if we have increased nbatch since starting the outer
654          * scan, we have to rescan outer batches in case they contain tuples that
655          * need to be reassigned.
656          */
657         curbatch++;
658         while (curbatch < nbatch &&
659                    (hashtable->outerBatchFile[curbatch] == NULL ||
660                         hashtable->innerBatchFile[curbatch] == NULL))
661         {
662                 if (hashtable->outerBatchFile[curbatch] &&
663                         hjstate->js.jointype == JOIN_LEFT)
664                         break;                          /* must process due to rule 1 */
665                 if (hashtable->innerBatchFile[curbatch] &&
666                         nbatch != hashtable->nbatch_original)
667                         break;                          /* must process due to rule 2 */
668                 if (hashtable->outerBatchFile[curbatch] &&
669                         nbatch != hashtable->nbatch_outstart)
670                         break;                          /* must process due to rule 3 */
671                 /* We can ignore this batch. */
672                 /* Release associated temp files right away. */
673                 if (hashtable->innerBatchFile[curbatch])
674                         BufFileClose(hashtable->innerBatchFile[curbatch]);
675                 hashtable->innerBatchFile[curbatch] = NULL;
676                 if (hashtable->outerBatchFile[curbatch])
677                         BufFileClose(hashtable->outerBatchFile[curbatch]);
678                 hashtable->outerBatchFile[curbatch] = NULL;
679                 curbatch++;
680         }
681
682         if (curbatch >= nbatch)
683                 return curbatch;                /* no more batches */
684
685         hashtable->curbatch = curbatch;
686
687         /*
688          * Reload the hash table with the new inner batch (which could be empty)
689          */
690         ExecHashTableReset(hashtable);
691
692         innerFile = hashtable->innerBatchFile[curbatch];
693
694         if (innerFile != NULL)
695         {
696                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
697                         ereport(ERROR,
698                                         (errcode_for_file_access(),
699                                    errmsg("could not rewind hash-join temporary file: %m")));
700
701                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
702                                                                                                  innerFile,
703                                                                                                  &hashvalue,
704                                                                                                  hjstate->hj_HashTupleSlot)))
705                 {
706                         /*
707                          * NOTE: some tuples may be sent to future batches.  Also, it is
708                          * possible for hashtable->nbatch to be increased here!
709                          */
710                         ExecHashTableInsert(hashtable,
711                                                                 ExecFetchSlotTuple(slot),
712                                                                 hashvalue);
713                 }
714
715                 /*
716                  * after we build the hash table, the inner batch file is no longer
717                  * needed
718                  */
719                 BufFileClose(innerFile);
720                 hashtable->innerBatchFile[curbatch] = NULL;
721         }
722
723         /*
724          * If there's no outer batch file, advance to next batch.
725          */
726         if (hashtable->outerBatchFile[curbatch] == NULL)
727                 goto start_over;
728
729         /*
730          * Rewind outer batch file, so that we can start reading it.
731          */
732         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
733                 ereport(ERROR,
734                                 (errcode_for_file_access(),
735                                  errmsg("could not rewind hash-join temporary file: %m")));
736
737         return curbatch;
738 }
739
740 /*
741  * ExecHashJoinSaveTuple
742  *              save a tuple to a batch file.
743  *
744  * The data recorded in the file for each tuple is its hash value,
745  * then an image of its HeapTupleData (with meaningless t_data pointer)
746  * followed by the HeapTupleHeader and tuple data.
747  *
748  * Note: it is important always to call this in the regular executor
749  * context, not in a shorter-lived context; else the temp file buffers
750  * will get messed up.
751  */
752 void
753 ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
754                                           BufFile **fileptr)
755 {
756         BufFile    *file = *fileptr;
757         size_t          written;
758
759         if (file == NULL)
760         {
761                 /* First write to this batch file, so open it. */
762                 file = BufFileCreateTemp(false);
763                 *fileptr = file;
764         }
765
766         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
767         if (written != sizeof(uint32))
768                 ereport(ERROR,
769                                 (errcode_for_file_access(),
770                                  errmsg("could not write to hash-join temporary file: %m")));
771
772         written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
773         if (written != sizeof(HeapTupleData))
774                 ereport(ERROR,
775                                 (errcode_for_file_access(),
776                                  errmsg("could not write to hash-join temporary file: %m")));
777
778         written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
779         if (written != (size_t) heapTuple->t_len)
780                 ereport(ERROR,
781                                 (errcode_for_file_access(),
782                                  errmsg("could not write to hash-join temporary file: %m")));
783 }
784
785 /*
786  * ExecHashJoinGetSavedTuple
787  *              read the next tuple from a batch file.  Return NULL if no more.
788  *
789  * On success, *hashvalue is set to the tuple's hash value, and the tuple
790  * itself is stored in the given slot.
791  */
792 static TupleTableSlot *
793 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
794                                                   BufFile *file,
795                                                   uint32 *hashvalue,
796                                                   TupleTableSlot *tupleSlot)
797 {
798         HeapTupleData htup;
799         size_t          nread;
800         HeapTuple       heapTuple;
801
802         nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
803         if (nread == 0)
804                 return NULL;                    /* end of file */
805         if (nread != sizeof(uint32))
806                 ereport(ERROR,
807                                 (errcode_for_file_access(),
808                                  errmsg("could not read from hash-join temporary file: %m")));
809         nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
810         if (nread != sizeof(HeapTupleData))
811                 ereport(ERROR,
812                                 (errcode_for_file_access(),
813                                  errmsg("could not read from hash-join temporary file: %m")));
814         heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
815         memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
816         heapTuple->t_data = (HeapTupleHeader)
817                 ((char *) heapTuple + HEAPTUPLESIZE);
818         nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
819         if (nread != (size_t) htup.t_len)
820                 ereport(ERROR,
821                                 (errcode_for_file_access(),
822                                  errmsg("could not read from hash-join temporary file: %m")));
823         return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
824 }
825
826
827 void
828 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
829 {
830         /*
831          * In a multi-batch join, we currently have to do rescans the hard way,
832          * primarily because batch temp files may have already been released. But
833          * if it's a single-batch join, and there is no parameter change for the
834          * inner subnode, then we can just re-use the existing hash table without
835          * rebuilding it.
836          */
837         if (node->hj_HashTable != NULL)
838         {
839                 if (node->hj_HashTable->nbatch == 1 &&
840                         ((PlanState *) node)->righttree->chgParam == NULL)
841                 {
842                         /*
843                          * okay to reuse the hash table; needn't rescan inner, either.
844                          *
845                          * What we do need to do is reset our state about the emptiness
846                          * of the outer relation, so that the new scan of the outer will
847                          * update it correctly if it turns out to be empty this time.
848                          * (There's no harm in clearing it now because ExecHashJoin won't
849                          * need the info.  In the other cases, where the hash table
850                          * doesn't exist or we are destroying it, we leave this state
851                          * alone because ExecHashJoin will need it the first time
852                          * through.)
853                          */
854                         node->hj_OuterNotEmpty = false;
855                 }
856                 else
857                 {
858                         /* must destroy and rebuild hash table */
859                         ExecHashTableDestroy(node->hj_HashTable);
860                         node->hj_HashTable = NULL;
861
862                         /*
863                          * if chgParam of subnode is not null then plan will be re-scanned
864                          * by first ExecProcNode.
865                          */
866                         if (((PlanState *) node)->righttree->chgParam == NULL)
867                                 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
868                 }
869         }
870
871         /* Always reset intra-tuple state */
872         node->hj_CurHashValue = 0;
873         node->hj_CurBucketNo = 0;
874         node->hj_CurTuple = NULL;
875
876         node->js.ps.ps_OuterTupleSlot = NULL;
877         node->js.ps.ps_TupFromTlist = false;
878         node->hj_NeedNewOuter = true;
879         node->hj_MatchedOuter = false;
880         node->hj_FirstOuterTupleSlot = NULL;
881
882         /*
883          * if chgParam of subnode is not null then plan will be re-scanned by
884          * first ExecProcNode.
885          */
886         if (((PlanState *) node)->lefttree->chgParam == NULL)
887                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
888 }