]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c
Update CVS HEAD for 2007 copyright. Back branches are typically not
[postgresql] / src / backend / executor / nodeHashjoin.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *        Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.86 2007/01/05 22:19:28 momjian Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15
16 #include "postgres.h"
17
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "utils/memutils.h"
23
24
25 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
26                                                   HashJoinState *hjstate,
27                                                   uint32 *hashvalue);
28 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
29                                                   BufFile *file,
30                                                   uint32 *hashvalue,
31                                                   TupleTableSlot *tupleSlot);
32 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
33
34
35 /* ----------------------------------------------------------------
36  *              ExecHashJoin
37  *
38  *              This function implements the Hybrid Hashjoin algorithm.
39  *
40  *              Note: the relation we build hash table on is the "inner"
41  *                        the other one is "outer".
42  * ----------------------------------------------------------------
43  */
44 TupleTableSlot *                                /* return: a tuple or NULL */
45 ExecHashJoin(HashJoinState *node)
46 {
47         EState     *estate;
48         PlanState  *outerNode;
49         HashState  *hashNode;
50         List       *joinqual;
51         List       *otherqual;
52         TupleTableSlot *inntuple;
53         ExprContext *econtext;
54         ExprDoneCond isDone;
55         HashJoinTable hashtable;
56         HashJoinTuple curtuple;
57         TupleTableSlot *outerTupleSlot;
58         uint32          hashvalue;
59         int                     batchno;
60
61         /*
62          * get information from HashJoin node
63          */
64         estate = node->js.ps.state;
65         joinqual = node->js.joinqual;
66         otherqual = node->js.ps.qual;
67         hashNode = (HashState *) innerPlanState(node);
68         outerNode = outerPlanState(node);
69
70         /*
71          * get information from HashJoin state
72          */
73         hashtable = node->hj_HashTable;
74         econtext = node->js.ps.ps_ExprContext;
75
76         /*
77          * Check to see if we're still projecting out tuples from a previous join
78          * tuple (because there is a function-returning-set in the projection
79          * expressions).  If so, try to project another one.
80          */
81         if (node->js.ps.ps_TupFromTlist)
82         {
83                 TupleTableSlot *result;
84
85                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
86                 if (isDone == ExprMultipleResult)
87                         return result;
88                 /* Done with that source tuple... */
89                 node->js.ps.ps_TupFromTlist = false;
90         }
91
92         /*
93          * If we're doing an IN join, we want to return at most one row per outer
94          * tuple; so we can stop scanning the inner scan if we matched on the
95          * previous try.
96          */
97         if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
98                 node->hj_NeedNewOuter = true;
99
100         /*
101          * Reset per-tuple memory context to free any expression evaluation
102          * storage allocated in the previous tuple cycle.  Note this can't happen
103          * until we're done projecting out tuples from a join tuple.
104          */
105         ResetExprContext(econtext);
106
107         /*
108          * if this is the first call, build the hash table for inner relation
109          */
110         if (hashtable == NULL)
111         {
112                 /*
113                  * If the outer relation is completely empty, we can quit without
114                  * building the hash table.  However, for an inner join it is only a
115                  * win to check this when the outer relation's startup cost is less
116                  * than the projected cost of building the hash table.  Otherwise it's
117                  * best to build the hash table first and see if the inner relation is
118                  * empty.  (When it's an outer join, we should always make this check,
119                  * since we aren't going to be able to skip the join on the strength
120                  * of an empty inner relation anyway.)
121                  *
122                  * If we are rescanning the join, we make use of information gained on
123                  * the previous scan: don't bother to try the prefetch if the previous
124                  * scan found the outer relation nonempty.      This is not 100% reliable
125                  * since with new parameters the outer relation might yield different
126                  * results, but it's a good heuristic.
127                  *
128                  * The only way to make the check is to try to fetch a tuple from the
129                  * outer plan node.  If we succeed, we have to stash it away for later
130                  * consumption by ExecHashJoinOuterGetTuple.
131                  */
132                 if (node->js.jointype == JOIN_LEFT ||
133                         (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
134                          !node->hj_OuterNotEmpty))
135                 {
136                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
137                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
138                         {
139                                 node->hj_OuterNotEmpty = false;
140                                 return NULL;
141                         }
142                         else
143                                 node->hj_OuterNotEmpty = true;
144                 }
145                 else
146                         node->hj_FirstOuterTupleSlot = NULL;
147
148                 /*
149                  * create the hash table
150                  */
151                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
152                                                                                 node->hj_HashOperators);
153                 node->hj_HashTable = hashtable;
154
155                 /*
156                  * execute the Hash node, to build the hash table
157                  */
158                 hashNode->hashtable = hashtable;
159                 (void) MultiExecProcNode((PlanState *) hashNode);
160
161                 /*
162                  * If the inner relation is completely empty, and we're not doing an
163                  * outer join, we can quit without scanning the outer relation.
164                  */
165                 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
166                         return NULL;
167
168                 /*
169                  * need to remember whether nbatch has increased since we began
170                  * scanning the outer relation
171                  */
172                 hashtable->nbatch_outstart = hashtable->nbatch;
173
174                 /*
175                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
176                  * above, because ExecHashJoinOuterGetTuple will immediately set it
177                  * again.)
178                  */
179                 node->hj_OuterNotEmpty = false;
180         }
181
182         /*
183          * run the hash join process
184          */
185         for (;;)
186         {
187                 /*
188                  * If we don't have an outer tuple, get the next one
189                  */
190                 if (node->hj_NeedNewOuter)
191                 {
192                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
193                                                                                                            node,
194                                                                                                            &hashvalue);
195                         if (TupIsNull(outerTupleSlot))
196                         {
197                                 /* end of join */
198                                 return NULL;
199                         }
200
201                         node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
202                         econtext->ecxt_outertuple = outerTupleSlot;
203                         node->hj_NeedNewOuter = false;
204                         node->hj_MatchedOuter = false;
205
206                         /*
207                          * now we have an outer tuple, find the corresponding bucket for
208                          * this tuple from the hash table
209                          */
210                         node->hj_CurHashValue = hashvalue;
211                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
212                                                                           &node->hj_CurBucketNo, &batchno);
213                         node->hj_CurTuple = NULL;
214
215                         /*
216                          * Now we've got an outer tuple and the corresponding hash bucket,
217                          * but this tuple may not belong to the current batch.
218                          */
219                         if (batchno != hashtable->curbatch)
220                         {
221                                 /*
222                                  * Need to postpone this outer tuple to a later batch. Save it
223                                  * in the corresponding outer-batch file.
224                                  */
225                                 Assert(batchno > hashtable->curbatch);
226                                 ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
227                                                                           hashvalue,
228                                                                           &hashtable->outerBatchFile[batchno]);
229                                 node->hj_NeedNewOuter = true;
230                                 continue;               /* loop around for a new outer tuple */
231                         }
232                 }
233
234                 /*
235                  * OK, scan the selected hash bucket for matches
236                  */
237                 for (;;)
238                 {
239                         curtuple = ExecScanHashBucket(node, econtext);
240                         if (curtuple == NULL)
241                                 break;                  /* out of matches */
242
243                         /*
244                          * we've got a match, but still need to test non-hashed quals
245                          */
246                         inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
247                                                                                          node->hj_HashTupleSlot,
248                                                                                          false);        /* don't pfree */
249                         econtext->ecxt_innertuple = inntuple;
250
251                         /* reset temp memory each time to avoid leaks from qual expr */
252                         ResetExprContext(econtext);
253
254                         /*
255                          * if we pass the qual, then save state for next call and have
256                          * ExecProject form the projection, store it in the tuple table,
257                          * and return the slot.
258                          *
259                          * Only the joinquals determine MatchedOuter status, but all quals
260                          * must pass to actually return the tuple.
261                          */
262                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
263                         {
264                                 node->hj_MatchedOuter = true;
265
266                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
267                                 {
268                                         TupleTableSlot *result;
269
270                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
271
272                                         if (isDone != ExprEndResult)
273                                         {
274                                                 node->js.ps.ps_TupFromTlist =
275                                                         (isDone == ExprMultipleResult);
276                                                 return result;
277                                         }
278                                 }
279
280                                 /*
281                                  * If we didn't return a tuple, may need to set NeedNewOuter
282                                  */
283                                 if (node->js.jointype == JOIN_IN)
284                                 {
285                                         node->hj_NeedNewOuter = true;
286                                         break;          /* out of loop over hash bucket */
287                                 }
288                         }
289                 }
290
291                 /*
292                  * Now the current outer tuple has run out of matches, so check
293                  * whether to emit a dummy outer-join tuple. If not, loop around to
294                  * get a new outer tuple.
295                  */
296                 node->hj_NeedNewOuter = true;
297
298                 if (!node->hj_MatchedOuter &&
299                         node->js.jointype == JOIN_LEFT)
300                 {
301                         /*
302                          * We are doing an outer join and there were no join matches for
303                          * this outer tuple.  Generate a fake join tuple with nulls for
304                          * the inner tuple, and return it if it passes the non-join quals.
305                          */
306                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
307
308                         if (ExecQual(otherqual, econtext, false))
309                         {
310                                 /*
311                                  * qualification was satisfied so we project and return the
312                                  * slot containing the result tuple using ExecProject().
313                                  */
314                                 TupleTableSlot *result;
315
316                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
317
318                                 if (isDone != ExprEndResult)
319                                 {
320                                         node->js.ps.ps_TupFromTlist =
321                                                 (isDone == ExprMultipleResult);
322                                         return result;
323                                 }
324                         }
325                 }
326         }
327 }
328
329 /* ----------------------------------------------------------------
330  *              ExecInitHashJoin
331  *
332  *              Init routine for HashJoin node.
333  * ----------------------------------------------------------------
334  */
335 HashJoinState *
336 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
337 {
338         HashJoinState *hjstate;
339         Plan       *outerNode;
340         Hash       *hashNode;
341         List       *lclauses;
342         List       *rclauses;
343         List       *hoperators;
344         ListCell   *l;
345
346         /* check for unsupported flags */
347         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
348
349         /*
350          * create state structure
351          */
352         hjstate = makeNode(HashJoinState);
353         hjstate->js.ps.plan = (Plan *) node;
354         hjstate->js.ps.state = estate;
355
356         /*
357          * Miscellaneous initialization
358          *
359          * create expression context for node
360          */
361         ExecAssignExprContext(estate, &hjstate->js.ps);
362
363         /*
364          * initialize child expressions
365          */
366         hjstate->js.ps.targetlist = (List *)
367                 ExecInitExpr((Expr *) node->join.plan.targetlist,
368                                          (PlanState *) hjstate);
369         hjstate->js.ps.qual = (List *)
370                 ExecInitExpr((Expr *) node->join.plan.qual,
371                                          (PlanState *) hjstate);
372         hjstate->js.jointype = node->join.jointype;
373         hjstate->js.joinqual = (List *)
374                 ExecInitExpr((Expr *) node->join.joinqual,
375                                          (PlanState *) hjstate);
376         hjstate->hashclauses = (List *)
377                 ExecInitExpr((Expr *) node->hashclauses,
378                                          (PlanState *) hjstate);
379
380         /*
381          * initialize child nodes
382          *
383          * Note: we could suppress the REWIND flag for the inner input, which
384          * would amount to betting that the hash will be a single batch.  Not
385          * clear if this would be a win or not.
386          */
387         outerNode = outerPlan(node);
388         hashNode = (Hash *) innerPlan(node);
389
390         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
391         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
392
393 #define HASHJOIN_NSLOTS 3
394
395         /*
396          * tuple table initialization
397          */
398         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
399         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
400
401         switch (node->join.jointype)
402         {
403                 case JOIN_INNER:
404                 case JOIN_IN:
405                         break;
406                 case JOIN_LEFT:
407                         hjstate->hj_NullInnerTupleSlot =
408                                 ExecInitNullTupleSlot(estate,
409                                                                  ExecGetResultType(innerPlanState(hjstate)));
410                         break;
411                 default:
412                         elog(ERROR, "unrecognized join type: %d",
413                                  (int) node->join.jointype);
414         }
415
416         /*
417          * now for some voodoo.  our temporary tuple slot is actually the result
418          * tuple slot of the Hash node (which is our inner plan).  we do this
419          * because Hash nodes don't return tuples via ExecProcNode() -- instead
420          * the hash join node uses ExecScanHashBucket() to get at the contents of
421          * the hash table.      -cim 6/9/91
422          */
423         {
424                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
425                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
426
427                 hjstate->hj_HashTupleSlot = slot;
428         }
429
430         /*
431          * initialize tuple type and projection info
432          */
433         ExecAssignResultTypeFromTL(&hjstate->js.ps);
434         ExecAssignProjectionInfo(&hjstate->js.ps);
435
436         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
437                                                   ExecGetResultType(outerPlanState(hjstate)));
438
439         /*
440          * initialize hash-specific info
441          */
442         hjstate->hj_HashTable = NULL;
443         hjstate->hj_FirstOuterTupleSlot = NULL;
444
445         hjstate->hj_CurHashValue = 0;
446         hjstate->hj_CurBucketNo = 0;
447         hjstate->hj_CurTuple = NULL;
448
449         /*
450          * Deconstruct the hash clauses into outer and inner argument values, so
451          * that we can evaluate those subexpressions separately.  Also make a list
452          * of the hash operator OIDs, in preparation for looking up the hash
453          * functions to use.
454          */
455         lclauses = NIL;
456         rclauses = NIL;
457         hoperators = NIL;
458         foreach(l, hjstate->hashclauses)
459         {
460                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
461                 OpExpr     *hclause;
462
463                 Assert(IsA(fstate, FuncExprState));
464                 hclause = (OpExpr *) fstate->xprstate.expr;
465                 Assert(IsA(hclause, OpExpr));
466                 lclauses = lappend(lclauses, linitial(fstate->args));
467                 rclauses = lappend(rclauses, lsecond(fstate->args));
468                 hoperators = lappend_oid(hoperators, hclause->opno);
469         }
470         hjstate->hj_OuterHashKeys = lclauses;
471         hjstate->hj_InnerHashKeys = rclauses;
472         hjstate->hj_HashOperators = hoperators;
473         /* child Hash node needs to evaluate inner hash keys, too */
474         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
475
476         hjstate->js.ps.ps_OuterTupleSlot = NULL;
477         hjstate->js.ps.ps_TupFromTlist = false;
478         hjstate->hj_NeedNewOuter = true;
479         hjstate->hj_MatchedOuter = false;
480         hjstate->hj_OuterNotEmpty = false;
481
482         return hjstate;
483 }
484
485 int
486 ExecCountSlotsHashJoin(HashJoin *node)
487 {
488         return ExecCountSlotsNode(outerPlan(node)) +
489                 ExecCountSlotsNode(innerPlan(node)) +
490                 HASHJOIN_NSLOTS;
491 }
492
493 /* ----------------------------------------------------------------
494  *              ExecEndHashJoin
495  *
496  *              clean up routine for HashJoin node
497  * ----------------------------------------------------------------
498  */
499 void
500 ExecEndHashJoin(HashJoinState *node)
501 {
502         /*
503          * Free hash table
504          */
505         if (node->hj_HashTable)
506         {
507                 ExecHashTableDestroy(node->hj_HashTable);
508                 node->hj_HashTable = NULL;
509         }
510
511         /*
512          * Free the exprcontext
513          */
514         ExecFreeExprContext(&node->js.ps);
515
516         /*
517          * clean out the tuple table
518          */
519         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
520         ExecClearTuple(node->hj_OuterTupleSlot);
521         ExecClearTuple(node->hj_HashTupleSlot);
522
523         /*
524          * clean up subtrees
525          */
526         ExecEndNode(outerPlanState(node));
527         ExecEndNode(innerPlanState(node));
528 }
529
530 /*
531  * ExecHashJoinOuterGetTuple
532  *
533  *              get the next outer tuple for hashjoin: either by
534  *              executing a plan node in the first pass, or from
535  *              the temp files for the hashjoin batches.
536  *
537  * Returns a null slot if no more outer tuples.  On success, the tuple's
538  * hash value is stored at *hashvalue --- this is either originally computed,
539  * or re-read from the temp file.
540  */
541 static TupleTableSlot *
542 ExecHashJoinOuterGetTuple(PlanState *outerNode,
543                                                   HashJoinState *hjstate,
544                                                   uint32 *hashvalue)
545 {
546         HashJoinTable hashtable = hjstate->hj_HashTable;
547         int                     curbatch = hashtable->curbatch;
548         TupleTableSlot *slot;
549
550         if (curbatch == 0)
551         {                                                       /* if it is the first pass */
552
553                 /*
554                  * Check to see if first outer tuple was already fetched by
555                  * ExecHashJoin() and not used yet.
556                  */
557                 slot = hjstate->hj_FirstOuterTupleSlot;
558                 if (!TupIsNull(slot))
559                         hjstate->hj_FirstOuterTupleSlot = NULL;
560                 else
561                         slot = ExecProcNode(outerNode);
562                 if (!TupIsNull(slot))
563                 {
564                         /*
565                          * We have to compute the tuple's hash value.
566                          */
567                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
568
569                         econtext->ecxt_outertuple = slot;
570                         *hashvalue = ExecHashGetHashValue(hashtable, econtext,
571                                                                                           hjstate->hj_OuterHashKeys);
572
573                         /* remember outer relation is not empty for possible rescan */
574                         hjstate->hj_OuterNotEmpty = true;
575
576                         return slot;
577                 }
578
579                 /*
580                  * We have just reached the end of the first pass. Try to switch to a
581                  * saved batch.
582                  */
583                 curbatch = ExecHashJoinNewBatch(hjstate);
584         }
585
586         /*
587          * Try to read from a temp file. Loop allows us to advance to new batches
588          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
589          * don't try to optimize this loop.
590          */
591         while (curbatch < hashtable->nbatch)
592         {
593                 slot = ExecHashJoinGetSavedTuple(hjstate,
594                                                                                  hashtable->outerBatchFile[curbatch],
595                                                                                  hashvalue,
596                                                                                  hjstate->hj_OuterTupleSlot);
597                 if (!TupIsNull(slot))
598                         return slot;
599                 curbatch = ExecHashJoinNewBatch(hjstate);
600         }
601
602         /* Out of batches... */
603         return NULL;
604 }
605
606 /*
607  * ExecHashJoinNewBatch
608  *              switch to a new hashjoin batch
609  *
610  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
611  * We will never return a batch number that has an empty outer batch file.
612  */
613 static int
614 ExecHashJoinNewBatch(HashJoinState *hjstate)
615 {
616         HashJoinTable hashtable = hjstate->hj_HashTable;
617         int                     nbatch;
618         int                     curbatch;
619         BufFile    *innerFile;
620         TupleTableSlot *slot;
621         uint32          hashvalue;
622
623 start_over:
624         nbatch = hashtable->nbatch;
625         curbatch = hashtable->curbatch;
626
627         if (curbatch > 0)
628         {
629                 /*
630                  * We no longer need the previous outer batch file; close it right
631                  * away to free disk space.
632                  */
633                 if (hashtable->outerBatchFile[curbatch])
634                         BufFileClose(hashtable->outerBatchFile[curbatch]);
635                 hashtable->outerBatchFile[curbatch] = NULL;
636         }
637
638         /*
639          * We can always skip over any batches that are completely empty on both
640          * sides.  We can sometimes skip over batches that are empty on only one
641          * side, but there are exceptions:
642          *
643          * 1. In a LEFT JOIN, we have to process outer batches even if the inner
644          * batch is empty.
645          *
646          * 2. If we have increased nbatch since the initial estimate, we have to
647          * scan inner batches since they might contain tuples that need to be
648          * reassigned to later inner batches.
649          *
650          * 3. Similarly, if we have increased nbatch since starting the outer
651          * scan, we have to rescan outer batches in case they contain tuples that
652          * need to be reassigned.
653          */
654         curbatch++;
655         while (curbatch < nbatch &&
656                    (hashtable->outerBatchFile[curbatch] == NULL ||
657                         hashtable->innerBatchFile[curbatch] == NULL))
658         {
659                 if (hashtable->outerBatchFile[curbatch] &&
660                         hjstate->js.jointype == JOIN_LEFT)
661                         break;                          /* must process due to rule 1 */
662                 if (hashtable->innerBatchFile[curbatch] &&
663                         nbatch != hashtable->nbatch_original)
664                         break;                          /* must process due to rule 2 */
665                 if (hashtable->outerBatchFile[curbatch] &&
666                         nbatch != hashtable->nbatch_outstart)
667                         break;                          /* must process due to rule 3 */
668                 /* We can ignore this batch. */
669                 /* Release associated temp files right away. */
670                 if (hashtable->innerBatchFile[curbatch])
671                         BufFileClose(hashtable->innerBatchFile[curbatch]);
672                 hashtable->innerBatchFile[curbatch] = NULL;
673                 if (hashtable->outerBatchFile[curbatch])
674                         BufFileClose(hashtable->outerBatchFile[curbatch]);
675                 hashtable->outerBatchFile[curbatch] = NULL;
676                 curbatch++;
677         }
678
679         if (curbatch >= nbatch)
680                 return curbatch;                /* no more batches */
681
682         hashtable->curbatch = curbatch;
683
684         /*
685          * Reload the hash table with the new inner batch (which could be empty)
686          */
687         ExecHashTableReset(hashtable);
688
689         innerFile = hashtable->innerBatchFile[curbatch];
690
691         if (innerFile != NULL)
692         {
693                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
694                         ereport(ERROR,
695                                         (errcode_for_file_access(),
696                                    errmsg("could not rewind hash-join temporary file: %m")));
697
698                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
699                                                                                                  innerFile,
700                                                                                                  &hashvalue,
701                                                                                                  hjstate->hj_HashTupleSlot)))
702                 {
703                         /*
704                          * NOTE: some tuples may be sent to future batches.  Also, it is
705                          * possible for hashtable->nbatch to be increased here!
706                          */
707                         ExecHashTableInsert(hashtable, slot, hashvalue);
708                 }
709
710                 /*
711                  * after we build the hash table, the inner batch file is no longer
712                  * needed
713                  */
714                 BufFileClose(innerFile);
715                 hashtable->innerBatchFile[curbatch] = NULL;
716         }
717
718         /*
719          * If there's no outer batch file, advance to next batch.
720          */
721         if (hashtable->outerBatchFile[curbatch] == NULL)
722                 goto start_over;
723
724         /*
725          * Rewind outer batch file, so that we can start reading it.
726          */
727         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
728                 ereport(ERROR,
729                                 (errcode_for_file_access(),
730                                  errmsg("could not rewind hash-join temporary file: %m")));
731
732         return curbatch;
733 }
734
735 /*
736  * ExecHashJoinSaveTuple
737  *              save a tuple to a batch file.
738  *
739  * The data recorded in the file for each tuple is its hash value,
740  * then the tuple in MinimalTuple format.
741  *
742  * Note: it is important always to call this in the regular executor
743  * context, not in a shorter-lived context; else the temp file buffers
744  * will get messed up.
745  */
746 void
747 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
748                                           BufFile **fileptr)
749 {
750         BufFile    *file = *fileptr;
751         size_t          written;
752
753         if (file == NULL)
754         {
755                 /* First write to this batch file, so open it. */
756                 file = BufFileCreateTemp(false);
757                 *fileptr = file;
758         }
759
760         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
761         if (written != sizeof(uint32))
762                 ereport(ERROR,
763                                 (errcode_for_file_access(),
764                                  errmsg("could not write to hash-join temporary file: %m")));
765
766         written = BufFileWrite(file, (void *) tuple, tuple->t_len);
767         if (written != tuple->t_len)
768                 ereport(ERROR,
769                                 (errcode_for_file_access(),
770                                  errmsg("could not write to hash-join temporary file: %m")));
771 }
772
773 /*
774  * ExecHashJoinGetSavedTuple
775  *              read the next tuple from a batch file.  Return NULL if no more.
776  *
777  * On success, *hashvalue is set to the tuple's hash value, and the tuple
778  * itself is stored in the given slot.
779  */
780 static TupleTableSlot *
781 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
782                                                   BufFile *file,
783                                                   uint32 *hashvalue,
784                                                   TupleTableSlot *tupleSlot)
785 {
786         uint32          header[2];
787         size_t          nread;
788         MinimalTuple tuple;
789
790         /*
791          * Since both the hash value and the MinimalTuple length word are uint32,
792          * we can read them both in one BufFileRead() call without any type
793          * cheating.
794          */
795         nread = BufFileRead(file, (void *) header, sizeof(header));
796         if (nread == 0)                         /* end of file */
797         {
798                 ExecClearTuple(tupleSlot);
799                 return NULL;
800         }
801         if (nread != sizeof(header))
802                 ereport(ERROR,
803                                 (errcode_for_file_access(),
804                                  errmsg("could not read from hash-join temporary file: %m")));
805         *hashvalue = header[0];
806         tuple = (MinimalTuple) palloc(header[1]);
807         tuple->t_len = header[1];
808         nread = BufFileRead(file,
809                                                 (void *) ((char *) tuple + sizeof(uint32)),
810                                                 header[1] - sizeof(uint32));
811         if (nread != header[1] - sizeof(uint32))
812                 ereport(ERROR,
813                                 (errcode_for_file_access(),
814                                  errmsg("could not read from hash-join temporary file: %m")));
815         return ExecStoreMinimalTuple(tuple, tupleSlot, true);
816 }
817
818
819 void
820 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
821 {
822         /*
823          * In a multi-batch join, we currently have to do rescans the hard way,
824          * primarily because batch temp files may have already been released. But
825          * if it's a single-batch join, and there is no parameter change for the
826          * inner subnode, then we can just re-use the existing hash table without
827          * rebuilding it.
828          */
829         if (node->hj_HashTable != NULL)
830         {
831                 if (node->hj_HashTable->nbatch == 1 &&
832                         ((PlanState *) node)->righttree->chgParam == NULL)
833                 {
834                         /*
835                          * okay to reuse the hash table; needn't rescan inner, either.
836                          *
837                          * What we do need to do is reset our state about the emptiness of
838                          * the outer relation, so that the new scan of the outer will
839                          * update it correctly if it turns out to be empty this time.
840                          * (There's no harm in clearing it now because ExecHashJoin won't
841                          * need the info.  In the other cases, where the hash table
842                          * doesn't exist or we are destroying it, we leave this state
843                          * alone because ExecHashJoin will need it the first time
844                          * through.)
845                          */
846                         node->hj_OuterNotEmpty = false;
847                 }
848                 else
849                 {
850                         /* must destroy and rebuild hash table */
851                         ExecHashTableDestroy(node->hj_HashTable);
852                         node->hj_HashTable = NULL;
853
854                         /*
855                          * if chgParam of subnode is not null then plan will be re-scanned
856                          * by first ExecProcNode.
857                          */
858                         if (((PlanState *) node)->righttree->chgParam == NULL)
859                                 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
860                 }
861         }
862
863         /* Always reset intra-tuple state */
864         node->hj_CurHashValue = 0;
865         node->hj_CurBucketNo = 0;
866         node->hj_CurTuple = NULL;
867
868         node->js.ps.ps_OuterTupleSlot = NULL;
869         node->js.ps.ps_TupFromTlist = false;
870         node->hj_NeedNewOuter = true;
871         node->hj_MatchedOuter = false;
872         node->hj_FirstOuterTupleSlot = NULL;
873
874         /*
875          * if chgParam of subnode is not null then plan will be re-scanned by
876          * first ExecProcNode.
877          */
878         if (((PlanState *) node)->lefttree->chgParam == NULL)
879                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
880 }