]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c
Make sure that hash join's bulk-tuple-transfer loops are interruptible.
[postgresql] / src / backend / executor / nodeHashjoin.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *        Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/executor/nodeHashjoin.c
12  *
13  *-------------------------------------------------------------------------
14  */
15
16 #include "postgres.h"
17
18 #include "access/htup_details.h"
19 #include "executor/executor.h"
20 #include "executor/hashjoin.h"
21 #include "executor/nodeHash.h"
22 #include "executor/nodeHashjoin.h"
23 #include "miscadmin.h"
24 #include "utils/memutils.h"
25
26
27 /*
28  * States of the ExecHashJoin state machine
29  */
30 #define HJ_BUILD_HASHTABLE              1
31 #define HJ_NEED_NEW_OUTER               2
32 #define HJ_SCAN_BUCKET                  3
33 #define HJ_FILL_OUTER_TUPLE             4
34 #define HJ_FILL_INNER_TUPLES    5
35 #define HJ_NEED_NEW_BATCH               6
36
37 /* Returns true if doing null-fill on outer relation */
38 #define HJ_FILL_OUTER(hjstate)  ((hjstate)->hj_NullInnerTupleSlot != NULL)
39 /* Returns true if doing null-fill on inner relation */
40 #define HJ_FILL_INNER(hjstate)  ((hjstate)->hj_NullOuterTupleSlot != NULL)
41
42 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
43                                                   HashJoinState *hjstate,
44                                                   uint32 *hashvalue);
45 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
46                                                   BufFile *file,
47                                                   uint32 *hashvalue,
48                                                   TupleTableSlot *tupleSlot);
49 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
50
51
52 /* ----------------------------------------------------------------
53  *              ExecHashJoin
54  *
55  *              This function implements the Hybrid Hashjoin algorithm.
56  *
57  *              Note: the relation we build hash table on is the "inner"
58  *                        the other one is "outer".
59  * ----------------------------------------------------------------
60  */
61 TupleTableSlot *                                /* return: a tuple or NULL */
62 ExecHashJoin(HashJoinState *node)
63 {
64         PlanState  *outerNode;
65         HashState  *hashNode;
66         List       *joinqual;
67         List       *otherqual;
68         ExprContext *econtext;
69         HashJoinTable hashtable;
70         TupleTableSlot *outerTupleSlot;
71         uint32          hashvalue;
72         int                     batchno;
73
74         /*
75          * get information from HashJoin node
76          */
77         joinqual = node->js.joinqual;
78         otherqual = node->js.ps.qual;
79         hashNode = (HashState *) innerPlanState(node);
80         outerNode = outerPlanState(node);
81         hashtable = node->hj_HashTable;
82         econtext = node->js.ps.ps_ExprContext;
83
84         /*
85          * Reset per-tuple memory context to free any expression evaluation
86          * storage allocated in the previous tuple cycle.
87          */
88         ResetExprContext(econtext);
89
90         /*
91          * run the hash join state machine
92          */
93         for (;;)
94         {
95                 switch (node->hj_JoinState)
96                 {
97                         case HJ_BUILD_HASHTABLE:
98
99                                 /*
100                                  * First time through: build hash table for inner relation.
101                                  */
102                                 Assert(hashtable == NULL);
103
104                                 /*
105                                  * If the outer relation is completely empty, and it's not
106                                  * right/full join, we can quit without building the hash
107                                  * table.  However, for an inner join it is only a win to
108                                  * check this when the outer relation's startup cost is less
109                                  * than the projected cost of building the hash table.
110                                  * Otherwise it's best to build the hash table first and see
111                                  * if the inner relation is empty.  (When it's a left join, we
112                                  * should always make this check, since we aren't going to be
113                                  * able to skip the join on the strength of an empty inner
114                                  * relation anyway.)
115                                  *
116                                  * If we are rescanning the join, we make use of information
117                                  * gained on the previous scan: don't bother to try the
118                                  * prefetch if the previous scan found the outer relation
119                                  * nonempty. This is not 100% reliable since with new
120                                  * parameters the outer relation might yield different
121                                  * results, but it's a good heuristic.
122                                  *
123                                  * The only way to make the check is to try to fetch a tuple
124                                  * from the outer plan node.  If we succeed, we have to stash
125                                  * it away for later consumption by ExecHashJoinOuterGetTuple.
126                                  */
127                                 if (HJ_FILL_INNER(node))
128                                 {
129                                         /* no chance to not build the hash table */
130                                         node->hj_FirstOuterTupleSlot = NULL;
131                                 }
132                                 else if (HJ_FILL_OUTER(node) ||
133                                                  (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
134                                                   !node->hj_OuterNotEmpty))
135                                 {
136                                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
137                                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
138                                         {
139                                                 node->hj_OuterNotEmpty = false;
140                                                 return NULL;
141                                         }
142                                         else
143                                                 node->hj_OuterNotEmpty = true;
144                                 }
145                                 else
146                                         node->hj_FirstOuterTupleSlot = NULL;
147
148                                 /*
149                                  * create the hash table
150                                  */
151                                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
152                                                                                                 node->hj_HashOperators,
153                                                                                                 HJ_FILL_INNER(node));
154                                 node->hj_HashTable = hashtable;
155
156                                 /*
157                                  * execute the Hash node, to build the hash table
158                                  */
159                                 hashNode->hashtable = hashtable;
160                                 (void) MultiExecProcNode((PlanState *) hashNode);
161
162                                 /*
163                                  * If the inner relation is completely empty, and we're not
164                                  * doing a left outer join, we can quit without scanning the
165                                  * outer relation.
166                                  */
167                                 if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
168                                         return NULL;
169
170                                 /*
171                                  * need to remember whether nbatch has increased since we
172                                  * began scanning the outer relation
173                                  */
174                                 hashtable->nbatch_outstart = hashtable->nbatch;
175
176                                 /*
177                                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a
178                                  * tuple above, because ExecHashJoinOuterGetTuple will
179                                  * immediately set it again.)
180                                  */
181                                 node->hj_OuterNotEmpty = false;
182
183                                 node->hj_JoinState = HJ_NEED_NEW_OUTER;
184
185                                 /* FALL THRU */
186
187                         case HJ_NEED_NEW_OUTER:
188
189                                 /*
190                                  * We don't have an outer tuple, try to get the next one
191                                  */
192                                 outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
193                                                                                                                    node,
194                                                                                                                    &hashvalue);
195                                 if (TupIsNull(outerTupleSlot))
196                                 {
197                                         /* end of batch, or maybe whole join */
198                                         if (HJ_FILL_INNER(node))
199                                         {
200                                                 /* set up to scan for unmatched inner tuples */
201                                                 ExecPrepHashTableForUnmatched(node);
202                                                 node->hj_JoinState = HJ_FILL_INNER_TUPLES;
203                                         }
204                                         else
205                                                 node->hj_JoinState = HJ_NEED_NEW_BATCH;
206                                         continue;
207                                 }
208
209                                 econtext->ecxt_outertuple = outerTupleSlot;
210                                 node->hj_MatchedOuter = false;
211
212                                 /*
213                                  * Find the corresponding bucket for this tuple in the main
214                                  * hash table or skew hash table.
215                                  */
216                                 node->hj_CurHashValue = hashvalue;
217                                 ExecHashGetBucketAndBatch(hashtable, hashvalue,
218                                                                                   &node->hj_CurBucketNo, &batchno);
219                                 node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
220                                                                                                                                  hashvalue);
221                                 node->hj_CurTuple = NULL;
222
223                                 /*
224                                  * The tuple might not belong to the current batch (where
225                                  * "current batch" includes the skew buckets if any).
226                                  */
227                                 if (batchno != hashtable->curbatch &&
228                                         node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
229                                 {
230                                         /*
231                                          * Need to postpone this outer tuple to a later batch.
232                                          * Save it in the corresponding outer-batch file.
233                                          */
234                                         Assert(batchno > hashtable->curbatch);
235                                         ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
236                                                                                   hashvalue,
237                                                                                 &hashtable->outerBatchFile[batchno]);
238                                         /* Loop around, staying in HJ_NEED_NEW_OUTER state */
239                                         continue;
240                                 }
241
242                                 /* OK, let's scan the bucket for matches */
243                                 node->hj_JoinState = HJ_SCAN_BUCKET;
244
245                                 /* FALL THRU */
246
247                         case HJ_SCAN_BUCKET:
248
249                                 /*
250                                  * We check for interrupts here because this corresponds to
251                                  * where we'd fetch a row from a child plan node in other join
252                                  * types.
253                                  */
254                                 CHECK_FOR_INTERRUPTS();
255
256                                 /*
257                                  * Scan the selected hash bucket for matches to current outer
258                                  */
259                                 if (!ExecScanHashBucket(node, econtext))
260                                 {
261                                         /* out of matches; check for possible outer-join fill */
262                                         node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
263                                         continue;
264                                 }
265
266                                 /*
267                                  * We've got a match, but still need to test non-hashed quals.
268                                  * ExecScanHashBucket already set up all the state needed to
269                                  * call ExecQual.
270                                  *
271                                  * If we pass the qual, then save state for next call and have
272                                  * ExecProject form the projection, store it in the tuple
273                                  * table, and return the slot.
274                                  *
275                                  * Only the joinquals determine tuple match status, but all
276                                  * quals must pass to actually return the tuple.
277                                  */
278                                 if (joinqual == NIL || ExecQual(joinqual, econtext, false))
279                                 {
280                                         node->hj_MatchedOuter = true;
281                                         HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
282
283                                         /* In an antijoin, we never return a matched tuple */
284                                         if (node->js.jointype == JOIN_ANTI)
285                                         {
286                                                 node->hj_JoinState = HJ_NEED_NEW_OUTER;
287                                                 continue;
288                                         }
289
290                                         /*
291                                          * In a semijoin, we'll consider returning the first
292                                          * match, but after that we're done with this outer tuple.
293                                          */
294                                         if (node->js.jointype == JOIN_SEMI)
295                                                 node->hj_JoinState = HJ_NEED_NEW_OUTER;
296
297                                         if (otherqual == NIL ||
298                                                 ExecQual(otherqual, econtext, false))
299                                                 return ExecProject(node->js.ps.ps_ProjInfo);
300                                         else
301                                                 InstrCountFiltered2(node, 1);
302                                 }
303                                 else
304                                         InstrCountFiltered1(node, 1);
305                                 break;
306
307                         case HJ_FILL_OUTER_TUPLE:
308
309                                 /*
310                                  * The current outer tuple has run out of matches, so check
311                                  * whether to emit a dummy outer-join tuple.  Whether we emit
312                                  * one or not, the next state is NEED_NEW_OUTER.
313                                  */
314                                 node->hj_JoinState = HJ_NEED_NEW_OUTER;
315
316                                 if (!node->hj_MatchedOuter &&
317                                         HJ_FILL_OUTER(node))
318                                 {
319                                         /*
320                                          * Generate a fake join tuple with nulls for the inner
321                                          * tuple, and return it if it passes the non-join quals.
322                                          */
323                                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
324
325                                         if (otherqual == NIL ||
326                                                 ExecQual(otherqual, econtext, false))
327                                                 return ExecProject(node->js.ps.ps_ProjInfo);
328                                         else
329                                                 InstrCountFiltered2(node, 1);
330                                 }
331                                 break;
332
333                         case HJ_FILL_INNER_TUPLES:
334
335                                 /*
336                                  * We have finished a batch, but we are doing right/full join,
337                                  * so any unmatched inner tuples in the hashtable have to be
338                                  * emitted before we continue to the next batch.
339                                  */
340                                 if (!ExecScanHashTableForUnmatched(node, econtext))
341                                 {
342                                         /* no more unmatched tuples */
343                                         node->hj_JoinState = HJ_NEED_NEW_BATCH;
344                                         continue;
345                                 }
346
347                                 /*
348                                  * Generate a fake join tuple with nulls for the outer tuple,
349                                  * and return it if it passes the non-join quals.
350                                  */
351                                 econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
352
353                                 if (otherqual == NIL ||
354                                         ExecQual(otherqual, econtext, false))
355                                         return ExecProject(node->js.ps.ps_ProjInfo);
356                                 else
357                                         InstrCountFiltered2(node, 1);
358                                 break;
359
360                         case HJ_NEED_NEW_BATCH:
361
362                                 /*
363                                  * Try to advance to next batch.  Done if there are no more.
364                                  */
365                                 if (!ExecHashJoinNewBatch(node))
366                                         return NULL;    /* end of join */
367                                 node->hj_JoinState = HJ_NEED_NEW_OUTER;
368                                 break;
369
370                         default:
371                                 elog(ERROR, "unrecognized hashjoin state: %d",
372                                          (int) node->hj_JoinState);
373                 }
374         }
375 }
376
377 /* ----------------------------------------------------------------
378  *              ExecInitHashJoin
379  *
380  *              Init routine for HashJoin node.
381  * ----------------------------------------------------------------
382  */
383 HashJoinState *
384 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
385 {
386         HashJoinState *hjstate;
387         Plan       *outerNode;
388         Hash       *hashNode;
389         List       *lclauses;
390         List       *rclauses;
391         List       *hoperators;
392         ListCell   *l;
393
394         /* check for unsupported flags */
395         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
396
397         /*
398          * create state structure
399          */
400         hjstate = makeNode(HashJoinState);
401         hjstate->js.ps.plan = (Plan *) node;
402         hjstate->js.ps.state = estate;
403
404         /*
405          * Miscellaneous initialization
406          *
407          * create expression context for node
408          */
409         ExecAssignExprContext(estate, &hjstate->js.ps);
410
411         /*
412          * initialize child expressions
413          */
414         hjstate->js.ps.targetlist = (List *)
415                 ExecInitExpr((Expr *) node->join.plan.targetlist,
416                                          (PlanState *) hjstate);
417         hjstate->js.ps.qual = (List *)
418                 ExecInitExpr((Expr *) node->join.plan.qual,
419                                          (PlanState *) hjstate);
420         hjstate->js.jointype = node->join.jointype;
421         hjstate->js.joinqual = (List *)
422                 ExecInitExpr((Expr *) node->join.joinqual,
423                                          (PlanState *) hjstate);
424         hjstate->hashclauses = (List *)
425                 ExecInitExpr((Expr *) node->hashclauses,
426                                          (PlanState *) hjstate);
427
428         /*
429          * initialize child nodes
430          *
431          * Note: we could suppress the REWIND flag for the inner input, which
432          * would amount to betting that the hash will be a single batch.  Not
433          * clear if this would be a win or not.
434          */
435         outerNode = outerPlan(node);
436         hashNode = (Hash *) innerPlan(node);
437
438         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
439         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
440
441         /*
442          * tuple table initialization
443          */
444         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
445         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
446
447         /* set up null tuples for outer joins, if needed */
448         switch (node->join.jointype)
449         {
450                 case JOIN_INNER:
451                 case JOIN_SEMI:
452                         break;
453                 case JOIN_LEFT:
454                 case JOIN_ANTI:
455                         hjstate->hj_NullInnerTupleSlot =
456                                 ExecInitNullTupleSlot(estate,
457                                                                  ExecGetResultType(innerPlanState(hjstate)));
458                         break;
459                 case JOIN_RIGHT:
460                         hjstate->hj_NullOuterTupleSlot =
461                                 ExecInitNullTupleSlot(estate,
462                                                                  ExecGetResultType(outerPlanState(hjstate)));
463                         break;
464                 case JOIN_FULL:
465                         hjstate->hj_NullOuterTupleSlot =
466                                 ExecInitNullTupleSlot(estate,
467                                                                  ExecGetResultType(outerPlanState(hjstate)));
468                         hjstate->hj_NullInnerTupleSlot =
469                                 ExecInitNullTupleSlot(estate,
470                                                                  ExecGetResultType(innerPlanState(hjstate)));
471                         break;
472                 default:
473                         elog(ERROR, "unrecognized join type: %d",
474                                  (int) node->join.jointype);
475         }
476
477         /*
478          * now for some voodoo.  our temporary tuple slot is actually the result
479          * tuple slot of the Hash node (which is our inner plan).  we can do this
480          * because Hash nodes don't return tuples via ExecProcNode() -- instead
481          * the hash join node uses ExecScanHashBucket() to get at the contents of
482          * the hash table.  -cim 6/9/91
483          */
484         {
485                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
486                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
487
488                 hjstate->hj_HashTupleSlot = slot;
489         }
490
491         /*
492          * initialize tuple type and projection info
493          */
494         ExecAssignResultTypeFromTL(&hjstate->js.ps);
495         ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
496
497         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
498                                                   ExecGetResultType(outerPlanState(hjstate)));
499
500         /*
501          * initialize hash-specific info
502          */
503         hjstate->hj_HashTable = NULL;
504         hjstate->hj_FirstOuterTupleSlot = NULL;
505
506         hjstate->hj_CurHashValue = 0;
507         hjstate->hj_CurBucketNo = 0;
508         hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
509         hjstate->hj_CurTuple = NULL;
510
511         /*
512          * Deconstruct the hash clauses into outer and inner argument values, so
513          * that we can evaluate those subexpressions separately.  Also make a list
514          * of the hash operator OIDs, in preparation for looking up the hash
515          * functions to use.
516          */
517         lclauses = NIL;
518         rclauses = NIL;
519         hoperators = NIL;
520         foreach(l, hjstate->hashclauses)
521         {
522                 FuncExprState *fstate = castNode(FuncExprState, lfirst(l));
523                 OpExpr     *hclause = castNode(OpExpr, fstate->xprstate.expr);
524
525                 lclauses = lappend(lclauses, linitial(fstate->args));
526                 rclauses = lappend(rclauses, lsecond(fstate->args));
527                 hoperators = lappend_oid(hoperators, hclause->opno);
528         }
529         hjstate->hj_OuterHashKeys = lclauses;
530         hjstate->hj_InnerHashKeys = rclauses;
531         hjstate->hj_HashOperators = hoperators;
532         /* child Hash node needs to evaluate inner hash keys, too */
533         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
534
535         hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
536         hjstate->hj_MatchedOuter = false;
537         hjstate->hj_OuterNotEmpty = false;
538
539         return hjstate;
540 }
541
542 /* ----------------------------------------------------------------
543  *              ExecEndHashJoin
544  *
545  *              clean up routine for HashJoin node
546  * ----------------------------------------------------------------
547  */
548 void
549 ExecEndHashJoin(HashJoinState *node)
550 {
551         /*
552          * Free hash table
553          */
554         if (node->hj_HashTable)
555         {
556                 ExecHashTableDestroy(node->hj_HashTable);
557                 node->hj_HashTable = NULL;
558         }
559
560         /*
561          * Free the exprcontext
562          */
563         ExecFreeExprContext(&node->js.ps);
564
565         /*
566          * clean out the tuple table
567          */
568         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
569         ExecClearTuple(node->hj_OuterTupleSlot);
570         ExecClearTuple(node->hj_HashTupleSlot);
571
572         /*
573          * clean up subtrees
574          */
575         ExecEndNode(outerPlanState(node));
576         ExecEndNode(innerPlanState(node));
577 }
578
579 /*
580  * ExecHashJoinOuterGetTuple
581  *
582  *              get the next outer tuple for hashjoin: either by
583  *              executing the outer plan node in the first pass, or from
584  *              the temp files for the hashjoin batches.
585  *
586  * Returns a null slot if no more outer tuples (within the current batch).
587  *
588  * On success, the tuple's hash value is stored at *hashvalue --- this is
589  * either originally computed, or re-read from the temp file.
590  */
591 static TupleTableSlot *
592 ExecHashJoinOuterGetTuple(PlanState *outerNode,
593                                                   HashJoinState *hjstate,
594                                                   uint32 *hashvalue)
595 {
596         HashJoinTable hashtable = hjstate->hj_HashTable;
597         int                     curbatch = hashtable->curbatch;
598         TupleTableSlot *slot;
599
600         if (curbatch == 0)                      /* if it is the first pass */
601         {
602                 /*
603                  * Check to see if first outer tuple was already fetched by
604                  * ExecHashJoin() and not used yet.
605                  */
606                 slot = hjstate->hj_FirstOuterTupleSlot;
607                 if (!TupIsNull(slot))
608                         hjstate->hj_FirstOuterTupleSlot = NULL;
609                 else
610                         slot = ExecProcNode(outerNode);
611
612                 while (!TupIsNull(slot))
613                 {
614                         /*
615                          * We have to compute the tuple's hash value.
616                          */
617                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
618
619                         econtext->ecxt_outertuple = slot;
620                         if (ExecHashGetHashValue(hashtable, econtext,
621                                                                          hjstate->hj_OuterHashKeys,
622                                                                          true,          /* outer tuple */
623                                                                          HJ_FILL_OUTER(hjstate),
624                                                                          hashvalue))
625                         {
626                                 /* remember outer relation is not empty for possible rescan */
627                                 hjstate->hj_OuterNotEmpty = true;
628
629                                 return slot;
630                         }
631
632                         /*
633                          * That tuple couldn't match because of a NULL, so discard it and
634                          * continue with the next one.
635                          */
636                         slot = ExecProcNode(outerNode);
637                 }
638         }
639         else if (curbatch < hashtable->nbatch)
640         {
641                 BufFile    *file = hashtable->outerBatchFile[curbatch];
642
643                 /*
644                  * In outer-join cases, we could get here even though the batch file
645                  * is empty.
646                  */
647                 if (file == NULL)
648                         return NULL;
649
650                 slot = ExecHashJoinGetSavedTuple(hjstate,
651                                                                                  file,
652                                                                                  hashvalue,
653                                                                                  hjstate->hj_OuterTupleSlot);
654                 if (!TupIsNull(slot))
655                         return slot;
656         }
657
658         /* End of this batch */
659         return NULL;
660 }
661
662 /*
663  * ExecHashJoinNewBatch
664  *              switch to a new hashjoin batch
665  *
666  * Returns true if successful, false if there are no more batches.
667  */
668 static bool
669 ExecHashJoinNewBatch(HashJoinState *hjstate)
670 {
671         HashJoinTable hashtable = hjstate->hj_HashTable;
672         int                     nbatch;
673         int                     curbatch;
674         BufFile    *innerFile;
675         TupleTableSlot *slot;
676         uint32          hashvalue;
677
678         nbatch = hashtable->nbatch;
679         curbatch = hashtable->curbatch;
680
681         if (curbatch > 0)
682         {
683                 /*
684                  * We no longer need the previous outer batch file; close it right
685                  * away to free disk space.
686                  */
687                 if (hashtable->outerBatchFile[curbatch])
688                         BufFileClose(hashtable->outerBatchFile[curbatch]);
689                 hashtable->outerBatchFile[curbatch] = NULL;
690         }
691         else    /* we just finished the first batch */
692         {
693                 /*
694                  * Reset some of the skew optimization state variables, since we no
695                  * longer need to consider skew tuples after the first batch. The
696                  * memory context reset we are about to do will release the skew
697                  * hashtable itself.
698                  */
699                 hashtable->skewEnabled = false;
700                 hashtable->skewBucket = NULL;
701                 hashtable->skewBucketNums = NULL;
702                 hashtable->nSkewBuckets = 0;
703                 hashtable->spaceUsedSkew = 0;
704         }
705
706         /*
707          * We can always skip over any batches that are completely empty on both
708          * sides.  We can sometimes skip over batches that are empty on only one
709          * side, but there are exceptions:
710          *
711          * 1. In a left/full outer join, we have to process outer batches even if
712          * the inner batch is empty.  Similarly, in a right/full outer join, we
713          * have to process inner batches even if the outer batch is empty.
714          *
715          * 2. If we have increased nbatch since the initial estimate, we have to
716          * scan inner batches since they might contain tuples that need to be
717          * reassigned to later inner batches.
718          *
719          * 3. Similarly, if we have increased nbatch since starting the outer
720          * scan, we have to rescan outer batches in case they contain tuples that
721          * need to be reassigned.
722          */
723         curbatch++;
724         while (curbatch < nbatch &&
725                    (hashtable->outerBatchFile[curbatch] == NULL ||
726                         hashtable->innerBatchFile[curbatch] == NULL))
727         {
728                 if (hashtable->outerBatchFile[curbatch] &&
729                         HJ_FILL_OUTER(hjstate))
730                         break;                          /* must process due to rule 1 */
731                 if (hashtable->innerBatchFile[curbatch] &&
732                         HJ_FILL_INNER(hjstate))
733                         break;                          /* must process due to rule 1 */
734                 if (hashtable->innerBatchFile[curbatch] &&
735                         nbatch != hashtable->nbatch_original)
736                         break;                          /* must process due to rule 2 */
737                 if (hashtable->outerBatchFile[curbatch] &&
738                         nbatch != hashtable->nbatch_outstart)
739                         break;                          /* must process due to rule 3 */
740                 /* We can ignore this batch. */
741                 /* Release associated temp files right away. */
742                 if (hashtable->innerBatchFile[curbatch])
743                         BufFileClose(hashtable->innerBatchFile[curbatch]);
744                 hashtable->innerBatchFile[curbatch] = NULL;
745                 if (hashtable->outerBatchFile[curbatch])
746                         BufFileClose(hashtable->outerBatchFile[curbatch]);
747                 hashtable->outerBatchFile[curbatch] = NULL;
748                 curbatch++;
749         }
750
751         if (curbatch >= nbatch)
752                 return false;                   /* no more batches */
753
754         hashtable->curbatch = curbatch;
755
756         /*
757          * Reload the hash table with the new inner batch (which could be empty)
758          */
759         ExecHashTableReset(hashtable);
760
761         innerFile = hashtable->innerBatchFile[curbatch];
762
763         if (innerFile != NULL)
764         {
765                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
766                         ereport(ERROR,
767                                         (errcode_for_file_access(),
768                                    errmsg("could not rewind hash-join temporary file: %m")));
769
770                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
771                                                                                                  innerFile,
772                                                                                                  &hashvalue,
773                                                                                                  hjstate->hj_HashTupleSlot)))
774                 {
775                         /*
776                          * NOTE: some tuples may be sent to future batches.  Also, it is
777                          * possible for hashtable->nbatch to be increased here!
778                          */
779                         ExecHashTableInsert(hashtable, slot, hashvalue);
780                 }
781
782                 /*
783                  * after we build the hash table, the inner batch file is no longer
784                  * needed
785                  */
786                 BufFileClose(innerFile);
787                 hashtable->innerBatchFile[curbatch] = NULL;
788         }
789
790         /*
791          * Rewind outer batch file (if present), so that we can start reading it.
792          */
793         if (hashtable->outerBatchFile[curbatch] != NULL)
794         {
795                 if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
796                         ereport(ERROR,
797                                         (errcode_for_file_access(),
798                                    errmsg("could not rewind hash-join temporary file: %m")));
799         }
800
801         return true;
802 }
803
804 /*
805  * ExecHashJoinSaveTuple
806  *              save a tuple to a batch file.
807  *
808  * The data recorded in the file for each tuple is its hash value,
809  * then the tuple in MinimalTuple format.
810  *
811  * Note: it is important always to call this in the regular executor
812  * context, not in a shorter-lived context; else the temp file buffers
813  * will get messed up.
814  */
815 void
816 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
817                                           BufFile **fileptr)
818 {
819         BufFile    *file = *fileptr;
820         size_t          written;
821
822         if (file == NULL)
823         {
824                 /* First write to this batch file, so open it. */
825                 file = BufFileCreateTemp(false);
826                 *fileptr = file;
827         }
828
829         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
830         if (written != sizeof(uint32))
831                 ereport(ERROR,
832                                 (errcode_for_file_access(),
833                                  errmsg("could not write to hash-join temporary file: %m")));
834
835         written = BufFileWrite(file, (void *) tuple, tuple->t_len);
836         if (written != tuple->t_len)
837                 ereport(ERROR,
838                                 (errcode_for_file_access(),
839                                  errmsg("could not write to hash-join temporary file: %m")));
840 }
841
842 /*
843  * ExecHashJoinGetSavedTuple
844  *              read the next tuple from a batch file.  Return NULL if no more.
845  *
846  * On success, *hashvalue is set to the tuple's hash value, and the tuple
847  * itself is stored in the given slot.
848  */
849 static TupleTableSlot *
850 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
851                                                   BufFile *file,
852                                                   uint32 *hashvalue,
853                                                   TupleTableSlot *tupleSlot)
854 {
855         uint32          header[2];
856         size_t          nread;
857         MinimalTuple tuple;
858
859         /*
860          * We check for interrupts here because this is typically taken as an
861          * alternative code path to an ExecProcNode() call, which would include
862          * such a check.
863          */
864         CHECK_FOR_INTERRUPTS();
865
866         /*
867          * Since both the hash value and the MinimalTuple length word are uint32,
868          * we can read them both in one BufFileRead() call without any type
869          * cheating.
870          */
871         nread = BufFileRead(file, (void *) header, sizeof(header));
872         if (nread == 0)                         /* end of file */
873         {
874                 ExecClearTuple(tupleSlot);
875                 return NULL;
876         }
877         if (nread != sizeof(header))
878                 ereport(ERROR,
879                                 (errcode_for_file_access(),
880                                  errmsg("could not read from hash-join temporary file: %m")));
881         *hashvalue = header[0];
882         tuple = (MinimalTuple) palloc(header[1]);
883         tuple->t_len = header[1];
884         nread = BufFileRead(file,
885                                                 (void *) ((char *) tuple + sizeof(uint32)),
886                                                 header[1] - sizeof(uint32));
887         if (nread != header[1] - sizeof(uint32))
888                 ereport(ERROR,
889                                 (errcode_for_file_access(),
890                                  errmsg("could not read from hash-join temporary file: %m")));
891         return ExecStoreMinimalTuple(tuple, tupleSlot, true);
892 }
893
894
895 void
896 ExecReScanHashJoin(HashJoinState *node)
897 {
898         /*
899          * In a multi-batch join, we currently have to do rescans the hard way,
900          * primarily because batch temp files may have already been released. But
901          * if it's a single-batch join, and there is no parameter change for the
902          * inner subnode, then we can just re-use the existing hash table without
903          * rebuilding it.
904          */
905         if (node->hj_HashTable != NULL)
906         {
907                 if (node->hj_HashTable->nbatch == 1 &&
908                         node->js.ps.righttree->chgParam == NULL)
909                 {
910                         /*
911                          * Okay to reuse the hash table; needn't rescan inner, either.
912                          *
913                          * However, if it's a right/full join, we'd better reset the
914                          * inner-tuple match flags contained in the table.
915                          */
916                         if (HJ_FILL_INNER(node))
917                                 ExecHashTableResetMatchFlags(node->hj_HashTable);
918
919                         /*
920                          * Also, we need to reset our state about the emptiness of the
921                          * outer relation, so that the new scan of the outer will update
922                          * it correctly if it turns out to be empty this time. (There's no
923                          * harm in clearing it now because ExecHashJoin won't need the
924                          * info.  In the other cases, where the hash table doesn't exist
925                          * or we are destroying it, we leave this state alone because
926                          * ExecHashJoin will need it the first time through.)
927                          */
928                         node->hj_OuterNotEmpty = false;
929
930                         /* ExecHashJoin can skip the BUILD_HASHTABLE step */
931                         node->hj_JoinState = HJ_NEED_NEW_OUTER;
932                 }
933                 else
934                 {
935                         /* must destroy and rebuild hash table */
936                         ExecHashTableDestroy(node->hj_HashTable);
937                         node->hj_HashTable = NULL;
938                         node->hj_JoinState = HJ_BUILD_HASHTABLE;
939
940                         /*
941                          * if chgParam of subnode is not null then plan will be re-scanned
942                          * by first ExecProcNode.
943                          */
944                         if (node->js.ps.righttree->chgParam == NULL)
945                                 ExecReScan(node->js.ps.righttree);
946                 }
947         }
948
949         /* Always reset intra-tuple state */
950         node->hj_CurHashValue = 0;
951         node->hj_CurBucketNo = 0;
952         node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
953         node->hj_CurTuple = NULL;
954
955         node->hj_MatchedOuter = false;
956         node->hj_FirstOuterTupleSlot = NULL;
957
958         /*
959          * if chgParam of subnode is not null then plan will be re-scanned by
960          * first ExecProcNode.
961          */
962         if (node->js.ps.lefttree->chgParam == NULL)
963                 ExecReScan(node->js.ps.lefttree);
964 }