granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nodeHashjoin.c
   4  *        Routines to handle hash join nodes
   5  *
   6  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.81 2006/03/05 15:58:26 momjian Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include "executor/executor.h"
  19 #include "executor/hashjoin.h"
  20 #include "executor/nodeHash.h"
  21 #include "executor/nodeHashjoin.h"
  22 #include "optimizer/clauses.h"
  23 #include "utils/memutils.h"
  24
  25
  26 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
  27                                                   HashJoinState *hjstate,
  28                                                   uint32 *hashvalue);
  29 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
  30                                                   BufFile *file,
  31                                                   uint32 *hashvalue,
  32                                                   TupleTableSlot *tupleSlot);
  33 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
  34
  35
  36 /* ----------------------------------------------------------------
  37  *              ExecHashJoin
  38  *
  39  *              This function implements the Hybrid Hashjoin algorithm.
  40  *
  41  *              Note: the relation we build hash table on is the "inner"
  42  *                        the other one is "outer".
  43  * ----------------------------------------------------------------
  44  */
  45 TupleTableSlot *                                /* return: a tuple or NULL */
  46 ExecHashJoin(HashJoinState *node)
  47 {
  48         EState     *estate;
  49         PlanState  *outerNode;
  50         HashState  *hashNode;
  51         List       *joinqual;
  52         List       *otherqual;
  53         TupleTableSlot *inntuple;
  54         ExprContext *econtext;
  55         ExprDoneCond isDone;
  56         HashJoinTable hashtable;
  57         HeapTuple       curtuple;
  58         TupleTableSlot *outerTupleSlot;
  59         uint32          hashvalue;
  60         int                     batchno;
  61
  62         /*
  63          * get information from HashJoin node
  64          */
  65         estate = node->js.ps.state;
  66         joinqual = node->js.joinqual;
  67         otherqual = node->js.ps.qual;
  68         hashNode = (HashState *) innerPlanState(node);
  69         outerNode = outerPlanState(node);
  70
  71         /*
  72          * get information from HashJoin state
  73          */
  74         hashtable = node->hj_HashTable;
  75         econtext = node->js.ps.ps_ExprContext;
  76
  77         /*
  78          * Check to see if we're still projecting out tuples from a previous join
  79          * tuple (because there is a function-returning-set in the projection
  80          * expressions).  If so, try to project another one.
  81          */
  82         if (node->js.ps.ps_TupFromTlist)
  83         {
  84                 TupleTableSlot *result;
  85
  86                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
  87                 if (isDone == ExprMultipleResult)
  88                         return result;
  89                 /* Done with that source tuple... */
  90                 node->js.ps.ps_TupFromTlist = false;
  91         }
  92
  93         /*
  94          * If we're doing an IN join, we want to return at most one row per outer
  95          * tuple; so we can stop scanning the inner scan if we matched on the
  96          * previous try.
  97          */
  98         if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
  99                 node->hj_NeedNewOuter = true;
 100
 101         /*
 102          * Reset per-tuple memory context to free any expression evaluation
 103          * storage allocated in the previous tuple cycle.  Note this can't happen
 104          * until we're done projecting out tuples from a join tuple.
 105          */
 106         ResetExprContext(econtext);
 107
 108         /*
 109          * if this is the first call, build the hash table for inner relation
 110          */
 111         if (hashtable == NULL)
 112         {
 113                 /*
 114                  * If the outer relation is completely empty, we can quit without
 115                  * building the hash table.  However, for an inner join it is only a
 116                  * win to check this when the outer relation's startup cost is less
 117                  * than the projected cost of building the hash table.  Otherwise it's
 118                  * best to build the hash table first and see if the inner relation is
 119                  * empty.  (When it's an outer join, we should always make this check,
 120                  * since we aren't going to be able to skip the join on the strength
 121                  * of an empty inner relation anyway.)
 122                  *
 123                  * If we are rescanning the join, we make use of information gained
 124                  * on the previous scan: don't bother to try the prefetch if the
 125                  * previous scan found the outer relation nonempty.  This is not
 126                  * 100% reliable since with new parameters the outer relation might
 127                  * yield different results, but it's a good heuristic.
 128                  *
 129                  * The only way to make the check is to try to fetch a tuple from the
 130                  * outer plan node.  If we succeed, we have to stash it away for later
 131                  * consumption by ExecHashJoinOuterGetTuple.
 132                  */
 133                 if (node->js.jointype == JOIN_LEFT ||
 134                         (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
 135                          !node->hj_OuterNotEmpty))
 136                 {
 137                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
 138                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
 139                         {
 140                                 node->hj_OuterNotEmpty = false;
 141                                 return NULL;
 142                         }
 143                         else
 144                                 node->hj_OuterNotEmpty = true;
 145                 }
 146                 else
 147                         node->hj_FirstOuterTupleSlot = NULL;
 148
 149                 /*
 150                  * create the hash table
 151                  */
 152                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
 153                                                                                 node->hj_HashOperators);
 154                 node->hj_HashTable = hashtable;
 155
 156                 /*
 157                  * execute the Hash node, to build the hash table
 158                  */
 159                 hashNode->hashtable = hashtable;
 160                 (void) MultiExecProcNode((PlanState *) hashNode);
 161
 162                 /*
 163                  * If the inner relation is completely empty, and we're not doing an
 164                  * outer join, we can quit without scanning the outer relation.
 165                  */
 166                 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
 167                         return NULL;
 168
 169                 /*
 170                  * need to remember whether nbatch has increased since we began
 171                  * scanning the outer relation
 172                  */
 173                 hashtable->nbatch_outstart = hashtable->nbatch;
 174
 175                 /*
 176                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
 177                  * above, because ExecHashJoinOuterGetTuple will immediately
 178                  * set it again.)
 179                  */
 180                 node->hj_OuterNotEmpty = false;
 181         }
 182
 183         /*
 184          * run the hash join process
 185          */
 186         for (;;)
 187         {
 188                 /*
 189                  * If we don't have an outer tuple, get the next one
 190                  */
 191                 if (node->hj_NeedNewOuter)
 192                 {
 193                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
 194                                                                                                            node,
 195                                                                                                            &hashvalue);
 196                         if (TupIsNull(outerTupleSlot))
 197                         {
 198                                 /* end of join */
 199                                 return NULL;
 200                         }
 201
 202                         node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
 203                         econtext->ecxt_outertuple = outerTupleSlot;
 204                         node->hj_NeedNewOuter = false;
 205                         node->hj_MatchedOuter = false;
 206
 207                         /*
 208                          * now we have an outer tuple, find the corresponding bucket for
 209                          * this tuple from the hash table
 210                          */
 211                         node->hj_CurHashValue = hashvalue;
 212                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
 213                                                                           &node->hj_CurBucketNo, &batchno);
 214                         node->hj_CurTuple = NULL;
 215
 216                         /*
 217                          * Now we've got an outer tuple and the corresponding hash bucket,
 218                          * but this tuple may not belong to the current batch.
 219                          */
 220                         if (batchno != hashtable->curbatch)
 221                         {
 222                                 /*
 223                                  * Need to postpone this outer tuple to a later batch. Save it
 224                                  * in the corresponding outer-batch file.
 225                                  */
 226                                 Assert(batchno > hashtable->curbatch);
 227                                 ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
 228                                                                           hashvalue,
 229                                                                           &hashtable->outerBatchFile[batchno]);
 230                                 node->hj_NeedNewOuter = true;
 231                                 continue;               /* loop around for a new outer tuple */
 232                         }
 233                 }
 234
 235                 /*
 236                  * OK, scan the selected hash bucket for matches
 237                  */
 238                 for (;;)
 239                 {
 240                         curtuple = ExecScanHashBucket(node, econtext);
 241                         if (curtuple == NULL)
 242                                 break;                  /* out of matches */
 243
 244                         /*
 245                          * we've got a match, but still need to test non-hashed quals
 246                          */
 247                         inntuple = ExecStoreTuple(curtuple,
 248                                                                           node->hj_HashTupleSlot,
 249                                                                           InvalidBuffer,
 250                                                                           false);       /* don't pfree this tuple */
 251                         econtext->ecxt_innertuple = inntuple;
 252
 253                         /* reset temp memory each time to avoid leaks from qual expr */
 254                         ResetExprContext(econtext);
 255
 256                         /*
 257                          * if we pass the qual, then save state for next call and have
 258                          * ExecProject form the projection, store it in the tuple table,
 259                          * and return the slot.
 260                          *
 261                          * Only the joinquals determine MatchedOuter status, but all quals
 262                          * must pass to actually return the tuple.
 263                          */
 264                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
 265                         {
 266                                 node->hj_MatchedOuter = true;
 267
 268                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
 269                                 {
 270                                         TupleTableSlot *result;
 271
 272                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
 273
 274                                         if (isDone != ExprEndResult)
 275                                         {
 276                                                 node->js.ps.ps_TupFromTlist =
 277                                                         (isDone == ExprMultipleResult);
 278                                                 return result;
 279                                         }
 280                                 }
 281
 282                                 /*
 283                                  * If we didn't return a tuple, may need to set NeedNewOuter
 284                                  */
 285                                 if (node->js.jointype == JOIN_IN)
 286                                 {
 287                                         node->hj_NeedNewOuter = true;
 288                                         break;          /* out of loop over hash bucket */
 289                                 }
 290                         }
 291                 }
 292
 293                 /*
 294                  * Now the current outer tuple has run out of matches, so check
 295                  * whether to emit a dummy outer-join tuple. If not, loop around to
 296                  * get a new outer tuple.
 297                  */
 298                 node->hj_NeedNewOuter = true;
 299
 300                 if (!node->hj_MatchedOuter &&
 301                         node->js.jointype == JOIN_LEFT)
 302                 {
 303                         /*
 304                          * We are doing an outer join and there were no join matches for
 305                          * this outer tuple.  Generate a fake join tuple with nulls for
 306                          * the inner tuple, and return it if it passes the non-join quals.
 307                          */
 308                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
 309
 310                         if (ExecQual(otherqual, econtext, false))
 311                         {
 312                                 /*
 313                                  * qualification was satisfied so we project and return the
 314                                  * slot containing the result tuple using ExecProject().
 315                                  */
 316                                 TupleTableSlot *result;
 317
 318                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
 319
 320                                 if (isDone != ExprEndResult)
 321                                 {
 322                                         node->js.ps.ps_TupFromTlist =
 323                                                 (isDone == ExprMultipleResult);
 324                                         return result;
 325                                 }
 326                         }
 327                 }
 328         }
 329 }
 330
 331 /* ----------------------------------------------------------------
 332  *              ExecInitHashJoin
 333  *
 334  *              Init routine for HashJoin node.
 335  * ----------------------------------------------------------------
 336  */
 337 HashJoinState *
 338 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 339 {
 340         HashJoinState *hjstate;
 341         Plan       *outerNode;
 342         Hash       *hashNode;
 343         List       *lclauses;
 344         List       *rclauses;
 345         List       *hoperators;
 346         ListCell   *l;
 347
 348         /* check for unsupported flags */
 349         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
 350
 351         /*
 352          * create state structure
 353          */
 354         hjstate = makeNode(HashJoinState);
 355         hjstate->js.ps.plan = (Plan *) node;
 356         hjstate->js.ps.state = estate;
 357
 358         /*
 359          * Miscellaneous initialization
 360          *
 361          * create expression context for node
 362          */
 363         ExecAssignExprContext(estate, &hjstate->js.ps);
 364
 365         /*
 366          * initialize child expressions
 367          */
 368         hjstate->js.ps.targetlist = (List *)
 369                 ExecInitExpr((Expr *) node->join.plan.targetlist,
 370                                          (PlanState *) hjstate);
 371         hjstate->js.ps.qual = (List *)
 372                 ExecInitExpr((Expr *) node->join.plan.qual,
 373                                          (PlanState *) hjstate);
 374         hjstate->js.jointype = node->join.jointype;
 375         hjstate->js.joinqual = (List *)
 376                 ExecInitExpr((Expr *) node->join.joinqual,
 377                                          (PlanState *) hjstate);
 378         hjstate->hashclauses = (List *)
 379                 ExecInitExpr((Expr *) node->hashclauses,
 380                                          (PlanState *) hjstate);
 381
 382         /*
 383          * initialize child nodes
 384          *
 385          * Note: we could suppress the REWIND flag for the inner input, which
 386          * would amount to betting that the hash will be a single batch.  Not
 387          * clear if this would be a win or not.
 388          */
 389         outerNode = outerPlan(node);
 390         hashNode = (Hash *) innerPlan(node);
 391
 392         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
 393         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
 394
 395 #define HASHJOIN_NSLOTS 3
 396
 397         /*
 398          * tuple table initialization
 399          */
 400         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
 401         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
 402
 403         switch (node->join.jointype)
 404         {
 405                 case JOIN_INNER:
 406                 case JOIN_IN:
 407                         break;
 408                 case JOIN_LEFT:
 409                         hjstate->hj_NullInnerTupleSlot =
 410                                 ExecInitNullTupleSlot(estate,
 411                                                                  ExecGetResultType(innerPlanState(hjstate)));
 412                         break;
 413                 default:
 414                         elog(ERROR, "unrecognized join type: %d",
 415                                  (int) node->join.jointype);
 416         }
 417
 418         /*
 419          * now for some voodoo.  our temporary tuple slot is actually the result
 420          * tuple slot of the Hash node (which is our inner plan).  we do this
 421          * because Hash nodes don't return tuples via ExecProcNode() -- instead
 422          * the hash join node uses ExecScanHashBucket() to get at the contents of
 423          * the hash table.      -cim 6/9/91
 424          */
 425         {
 426                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
 427                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
 428
 429                 hjstate->hj_HashTupleSlot = slot;
 430         }
 431
 432         /*
 433          * initialize tuple type and projection info
 434          */
 435         ExecAssignResultTypeFromTL(&hjstate->js.ps);
 436         ExecAssignProjectionInfo(&hjstate->js.ps);
 437
 438         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
 439                                                   ExecGetResultType(outerPlanState(hjstate)),
 440                                                   false);
 441
 442         /*
 443          * initialize hash-specific info
 444          */
 445         hjstate->hj_HashTable = NULL;
 446         hjstate->hj_FirstOuterTupleSlot = NULL;
 447
 448         hjstate->hj_CurHashValue = 0;
 449         hjstate->hj_CurBucketNo = 0;
 450         hjstate->hj_CurTuple = NULL;
 451
 452         /*
 453          * Deconstruct the hash clauses into outer and inner argument values, so
 454          * that we can evaluate those subexpressions separately.  Also make a list
 455          * of the hash operator OIDs, in preparation for looking up the hash
 456          * functions to use.
 457          */
 458         lclauses = NIL;
 459         rclauses = NIL;
 460         hoperators = NIL;
 461         foreach(l, hjstate->hashclauses)
 462         {
 463                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
 464                 OpExpr     *hclause;
 465
 466                 Assert(IsA(fstate, FuncExprState));
 467                 hclause = (OpExpr *) fstate->xprstate.expr;
 468                 Assert(IsA(hclause, OpExpr));
 469                 lclauses = lappend(lclauses, linitial(fstate->args));
 470                 rclauses = lappend(rclauses, lsecond(fstate->args));
 471                 hoperators = lappend_oid(hoperators, hclause->opno);
 472         }
 473         hjstate->hj_OuterHashKeys = lclauses;
 474         hjstate->hj_InnerHashKeys = rclauses;
 475         hjstate->hj_HashOperators = hoperators;
 476         /* child Hash node needs to evaluate inner hash keys, too */
 477         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
 478
 479         hjstate->js.ps.ps_OuterTupleSlot = NULL;
 480         hjstate->js.ps.ps_TupFromTlist = false;
 481         hjstate->hj_NeedNewOuter = true;
 482         hjstate->hj_MatchedOuter = false;
 483         hjstate->hj_OuterNotEmpty = false;
 484
 485         return hjstate;
 486 }
 487
 488 int
 489 ExecCountSlotsHashJoin(HashJoin *node)
 490 {
 491         return ExecCountSlotsNode(outerPlan(node)) +
 492                 ExecCountSlotsNode(innerPlan(node)) +
 493                 HASHJOIN_NSLOTS;
 494 }
 495
 496 /* ----------------------------------------------------------------
 497  *              ExecEndHashJoin
 498  *
 499  *              clean up routine for HashJoin node
 500  * ----------------------------------------------------------------
 501  */
 502 void
 503 ExecEndHashJoin(HashJoinState *node)
 504 {
 505         /*
 506          * Free hash table
 507          */
 508         if (node->hj_HashTable)
 509         {
 510                 ExecHashTableDestroy(node->hj_HashTable);
 511                 node->hj_HashTable = NULL;
 512         }
 513
 514         /*
 515          * Free the exprcontext
 516          */
 517         ExecFreeExprContext(&node->js.ps);
 518
 519         /*
 520          * clean out the tuple table
 521          */
 522         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
 523         ExecClearTuple(node->hj_OuterTupleSlot);
 524         ExecClearTuple(node->hj_HashTupleSlot);
 525
 526         /*
 527          * clean up subtrees
 528          */
 529         ExecEndNode(outerPlanState(node));
 530         ExecEndNode(innerPlanState(node));
 531 }
 532
 533 /*
 534  * ExecHashJoinOuterGetTuple
 535  *
 536  *              get the next outer tuple for hashjoin: either by
 537  *              executing a plan node in the first pass, or from
 538  *              the temp files for the hashjoin batches.
 539  *
 540  * Returns a null slot if no more outer tuples.  On success, the tuple's
 541  * hash value is stored at *hashvalue --- this is either originally computed,
 542  * or re-read from the temp file.
 543  */
 544 static TupleTableSlot *
 545 ExecHashJoinOuterGetTuple(PlanState *outerNode,
 546                                                   HashJoinState *hjstate,
 547                                                   uint32 *hashvalue)
 548 {
 549         HashJoinTable hashtable = hjstate->hj_HashTable;
 550         int                     curbatch = hashtable->curbatch;
 551         TupleTableSlot *slot;
 552
 553         if (curbatch == 0)
 554         {                                                       /* if it is the first pass */
 555
 556                 /*
 557                  * Check to see if first outer tuple was already fetched by
 558                  * ExecHashJoin() and not used yet.
 559                  */
 560                 slot = hjstate->hj_FirstOuterTupleSlot;
 561                 if (!TupIsNull(slot))
 562                         hjstate->hj_FirstOuterTupleSlot = NULL;
 563                 else
 564                         slot = ExecProcNode(outerNode);
 565                 if (!TupIsNull(slot))
 566                 {
 567                         /*
 568                          * We have to compute the tuple's hash value.
 569                          */
 570                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
 571
 572                         econtext->ecxt_outertuple = slot;
 573                         *hashvalue = ExecHashGetHashValue(hashtable, econtext,
 574                                                                                           hjstate->hj_OuterHashKeys);
 575
 576                         /* remember outer relation is not empty for possible rescan */
 577                         hjstate->hj_OuterNotEmpty = true;
 578
 579                         return slot;
 580                 }
 581
 582                 /*
 583                  * We have just reached the end of the first pass. Try to switch to a
 584                  * saved batch.
 585                  */
 586                 curbatch = ExecHashJoinNewBatch(hjstate);
 587         }
 588
 589         /*
 590          * Try to read from a temp file. Loop allows us to advance to new batches
 591          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
 592          * don't try to optimize this loop.
 593          */
 594         while (curbatch < hashtable->nbatch)
 595         {
 596                 slot = ExecHashJoinGetSavedTuple(hjstate,
 597                                                                                  hashtable->outerBatchFile[curbatch],
 598                                                                                  hashvalue,
 599                                                                                  hjstate->hj_OuterTupleSlot);
 600                 if (!TupIsNull(slot))
 601                         return slot;
 602                 curbatch = ExecHashJoinNewBatch(hjstate);
 603         }
 604
 605         /* Out of batches... */
 606         return NULL;
 607 }
 608
 609 /*
 610  * ExecHashJoinNewBatch
 611  *              switch to a new hashjoin batch
 612  *
 613  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
 614  * We will never return a batch number that has an empty outer batch file.
 615  */
 616 static int
 617 ExecHashJoinNewBatch(HashJoinState *hjstate)
 618 {
 619         HashJoinTable hashtable = hjstate->hj_HashTable;
 620         int                     nbatch;
 621         int                     curbatch;
 622         BufFile    *innerFile;
 623         TupleTableSlot *slot;
 624         uint32          hashvalue;
 625
 626 start_over:
 627         nbatch = hashtable->nbatch;
 628         curbatch = hashtable->curbatch;
 629
 630         if (curbatch > 0)
 631         {
 632                 /*
 633                  * We no longer need the previous outer batch file; close it right
 634                  * away to free disk space.
 635                  */
 636                 if (hashtable->outerBatchFile[curbatch])
 637                         BufFileClose(hashtable->outerBatchFile[curbatch]);
 638                 hashtable->outerBatchFile[curbatch] = NULL;
 639         }
 640
 641         /*
 642          * We can always skip over any batches that are completely empty on both
 643          * sides.  We can sometimes skip over batches that are empty on only one
 644          * side, but there are exceptions:
 645          *
 646          * 1. In a LEFT JOIN, we have to process outer batches even if the inner
 647          * batch is empty.
 648          *
 649          * 2. If we have increased nbatch since the initial estimate, we have to
 650          * scan inner batches since they might contain tuples that need to be
 651          * reassigned to later inner batches.
 652          *
 653          * 3. Similarly, if we have increased nbatch since starting the outer
 654          * scan, we have to rescan outer batches in case they contain tuples that
 655          * need to be reassigned.
 656          */
 657         curbatch++;
 658         while (curbatch < nbatch &&
 659                    (hashtable->outerBatchFile[curbatch] == NULL ||
 660                         hashtable->innerBatchFile[curbatch] == NULL))
 661         {
 662                 if (hashtable->outerBatchFile[curbatch] &&
 663                         hjstate->js.jointype == JOIN_LEFT)
 664                         break;                          /* must process due to rule 1 */
 665                 if (hashtable->innerBatchFile[curbatch] &&
 666                         nbatch != hashtable->nbatch_original)
 667                         break;                          /* must process due to rule 2 */
 668                 if (hashtable->outerBatchFile[curbatch] &&
 669                         nbatch != hashtable->nbatch_outstart)
 670                         break;                          /* must process due to rule 3 */
 671                 /* We can ignore this batch. */
 672                 /* Release associated temp files right away. */
 673                 if (hashtable->innerBatchFile[curbatch])
 674                         BufFileClose(hashtable->innerBatchFile[curbatch]);
 675                 hashtable->innerBatchFile[curbatch] = NULL;
 676                 if (hashtable->outerBatchFile[curbatch])
 677                         BufFileClose(hashtable->outerBatchFile[curbatch]);
 678                 hashtable->outerBatchFile[curbatch] = NULL;
 679                 curbatch++;
 680         }
 681
 682         if (curbatch >= nbatch)
 683                 return curbatch;                /* no more batches */
 684
 685         hashtable->curbatch = curbatch;
 686
 687         /*
 688          * Reload the hash table with the new inner batch (which could be empty)
 689          */
 690         ExecHashTableReset(hashtable);
 691
 692         innerFile = hashtable->innerBatchFile[curbatch];
 693
 694         if (innerFile != NULL)
 695         {
 696                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
 697                         ereport(ERROR,
 698                                         (errcode_for_file_access(),
 699                                    errmsg("could not rewind hash-join temporary file: %m")));
 700
 701                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
 702                                                                                                  innerFile,
 703                                                                                                  &hashvalue,
 704                                                                                                  hjstate->hj_HashTupleSlot)))
 705                 {
 706                         /*
 707                          * NOTE: some tuples may be sent to future batches.  Also, it is
 708                          * possible for hashtable->nbatch to be increased here!
 709                          */
 710                         ExecHashTableInsert(hashtable,
 711                                                                 ExecFetchSlotTuple(slot),
 712                                                                 hashvalue);
 713                 }
 714
 715                 /*
 716                  * after we build the hash table, the inner batch file is no longer
 717                  * needed
 718                  */
 719                 BufFileClose(innerFile);
 720                 hashtable->innerBatchFile[curbatch] = NULL;
 721         }
 722
 723         /*
 724          * If there's no outer batch file, advance to next batch.
 725          */
 726         if (hashtable->outerBatchFile[curbatch] == NULL)
 727                 goto start_over;
 728
 729         /*
 730          * Rewind outer batch file, so that we can start reading it.
 731          */
 732         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
 733                 ereport(ERROR,
 734                                 (errcode_for_file_access(),
 735                                  errmsg("could not rewind hash-join temporary file: %m")));
 736
 737         return curbatch;
 738 }
 739
 740 /*
 741  * ExecHashJoinSaveTuple
 742  *              save a tuple to a batch file.
 743  *
 744  * The data recorded in the file for each tuple is its hash value,
 745  * then an image of its HeapTupleData (with meaningless t_data pointer)
 746  * followed by the HeapTupleHeader and tuple data.
 747  *
 748  * Note: it is important always to call this in the regular executor
 749  * context, not in a shorter-lived context; else the temp file buffers
 750  * will get messed up.
 751  */
 752 void
 753 ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
 754                                           BufFile **fileptr)
 755 {
 756         BufFile    *file = *fileptr;
 757         size_t          written;
 758
 759         if (file == NULL)
 760         {
 761                 /* First write to this batch file, so open it. */
 762                 file = BufFileCreateTemp(false);
 763                 *fileptr = file;
 764         }
 765
 766         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
 767         if (written != sizeof(uint32))
 768                 ereport(ERROR,
 769                                 (errcode_for_file_access(),
 770                                  errmsg("could not write to hash-join temporary file: %m")));
 771
 772         written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
 773         if (written != sizeof(HeapTupleData))
 774                 ereport(ERROR,
 775                                 (errcode_for_file_access(),
 776                                  errmsg("could not write to hash-join temporary file: %m")));
 777
 778         written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
 779         if (written != (size_t) heapTuple->t_len)
 780                 ereport(ERROR,
 781                                 (errcode_for_file_access(),
 782                                  errmsg("could not write to hash-join temporary file: %m")));
 783 }
 784
 785 /*
 786  * ExecHashJoinGetSavedTuple
 787  *              read the next tuple from a batch file.  Return NULL if no more.
 788  *
 789  * On success, *hashvalue is set to the tuple's hash value, and the tuple
 790  * itself is stored in the given slot.
 791  */
 792 static TupleTableSlot *
 793 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 794                                                   BufFile *file,
 795                                                   uint32 *hashvalue,
 796                                                   TupleTableSlot *tupleSlot)
 797 {
 798         HeapTupleData htup;
 799         size_t          nread;
 800         HeapTuple       heapTuple;
 801
 802         nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
 803         if (nread == 0)
 804                 return NULL;                    /* end of file */
 805         if (nread != sizeof(uint32))
 806                 ereport(ERROR,
 807                                 (errcode_for_file_access(),
 808                                  errmsg("could not read from hash-join temporary file: %m")));
 809         nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
 810         if (nread != sizeof(HeapTupleData))
 811                 ereport(ERROR,
 812                                 (errcode_for_file_access(),
 813                                  errmsg("could not read from hash-join temporary file: %m")));
 814         heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
 815         memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
 816         heapTuple->t_data = (HeapTupleHeader)
 817                 ((char *) heapTuple + HEAPTUPLESIZE);
 818         nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
 819         if (nread != (size_t) htup.t_len)
 820                 ereport(ERROR,
 821                                 (errcode_for_file_access(),
 822                                  errmsg("could not read from hash-join temporary file: %m")));
 823         return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
 824 }
 825
 826
 827 void
 828 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
 829 {
 830         /*
 831          * In a multi-batch join, we currently have to do rescans the hard way,
 832          * primarily because batch temp files may have already been released. But
 833          * if it's a single-batch join, and there is no parameter change for the
 834          * inner subnode, then we can just re-use the existing hash table without
 835          * rebuilding it.
 836          */
 837         if (node->hj_HashTable != NULL)
 838         {
 839                 if (node->hj_HashTable->nbatch == 1 &&
 840                         ((PlanState *) node)->righttree->chgParam == NULL)
 841                 {
 842                         /*
 843                          * okay to reuse the hash table; needn't rescan inner, either.
 844                          *
 845                          * What we do need to do is reset our state about the emptiness
 846                          * of the outer relation, so that the new scan of the outer will
 847                          * update it correctly if it turns out to be empty this time.
 848                          * (There's no harm in clearing it now because ExecHashJoin won't
 849                          * need the info.  In the other cases, where the hash table
 850                          * doesn't exist or we are destroying it, we leave this state
 851                          * alone because ExecHashJoin will need it the first time
 852                          * through.)
 853                          */
 854                         node->hj_OuterNotEmpty = false;
 855                 }
 856                 else
 857                 {
 858                         /* must destroy and rebuild hash table */
 859                         ExecHashTableDestroy(node->hj_HashTable);
 860                         node->hj_HashTable = NULL;
 861
 862                         /*
 863                          * if chgParam of subnode is not null then plan will be re-scanned
 864                          * by first ExecProcNode.
 865                          */
 866                         if (((PlanState *) node)->righttree->chgParam == NULL)
 867                                 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
 868                 }
 869         }
 870
 871         /* Always reset intra-tuple state */
 872         node->hj_CurHashValue = 0;
 873         node->hj_CurBucketNo = 0;
 874         node->hj_CurTuple = NULL;
 875
 876         node->js.ps.ps_OuterTupleSlot = NULL;
 877         node->js.ps.ps_TupFromTlist = false;
 878         node->hj_NeedNewOuter = true;
 879         node->hj_MatchedOuter = false;
 880         node->hj_FirstOuterTupleSlot = NULL;
 881
 882         /*
 883          * if chgParam of subnode is not null then plan will be re-scanned by
 884          * first ExecProcNode.
 885          */
 886         if (((PlanState *) node)->lefttree->chgParam == NULL)
 887                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
 888 }