granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHashjoin.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nodeHashjoin.c
   4  *        Routines to handle hash join nodes
   5  *
   6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.98 2009/03/21 00:04:38 tgl Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include "executor/executor.h"
  19 #include "executor/hashjoin.h"
  20 #include "executor/nodeHash.h"
  21 #include "executor/nodeHashjoin.h"
  22 #include "utils/memutils.h"
  23
  24
  25 /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
  26 #define HASHJOIN_IS_OUTER(hjstate)  ((hjstate)->hj_NullInnerTupleSlot != NULL)
  27
  28 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
  29                                                   HashJoinState *hjstate,
  30                                                   uint32 *hashvalue);
  31 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
  32                                                   BufFile *file,
  33                                                   uint32 *hashvalue,
  34                                                   TupleTableSlot *tupleSlot);
  35 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
  36
  37
  38 /* ----------------------------------------------------------------
  39  *              ExecHashJoin
  40  *
  41  *              This function implements the Hybrid Hashjoin algorithm.
  42  *
  43  *              Note: the relation we build hash table on is the "inner"
  44  *                        the other one is "outer".
  45  * ----------------------------------------------------------------
  46  */
  47 TupleTableSlot *                                /* return: a tuple or NULL */
  48 ExecHashJoin(HashJoinState *node)
  49 {
  50         EState     *estate;
  51         PlanState  *outerNode;
  52         HashState  *hashNode;
  53         List       *joinqual;
  54         List       *otherqual;
  55         TupleTableSlot *inntuple;
  56         ExprContext *econtext;
  57         ExprDoneCond isDone;
  58         HashJoinTable hashtable;
  59         HashJoinTuple curtuple;
  60         TupleTableSlot *outerTupleSlot;
  61         uint32          hashvalue;
  62         int                     batchno;
  63
  64         /*
  65          * get information from HashJoin node
  66          */
  67         estate = node->js.ps.state;
  68         joinqual = node->js.joinqual;
  69         otherqual = node->js.ps.qual;
  70         hashNode = (HashState *) innerPlanState(node);
  71         outerNode = outerPlanState(node);
  72
  73         /*
  74          * get information from HashJoin state
  75          */
  76         hashtable = node->hj_HashTable;
  77         econtext = node->js.ps.ps_ExprContext;
  78
  79         /*
  80          * Check to see if we're still projecting out tuples from a previous join
  81          * tuple (because there is a function-returning-set in the projection
  82          * expressions).  If so, try to project another one.
  83          */
  84         if (node->js.ps.ps_TupFromTlist)
  85         {
  86                 TupleTableSlot *result;
  87
  88                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
  89                 if (isDone == ExprMultipleResult)
  90                         return result;
  91                 /* Done with that source tuple... */
  92                 node->js.ps.ps_TupFromTlist = false;
  93         }
  94
  95         /*
  96          * Reset per-tuple memory context to free any expression evaluation
  97          * storage allocated in the previous tuple cycle.  Note this can't happen
  98          * until we're done projecting out tuples from a join tuple.
  99          */
 100         ResetExprContext(econtext);
 101
 102         /*
 103          * if this is the first call, build the hash table for inner relation
 104          */
 105         if (hashtable == NULL)
 106         {
 107                 /*
 108                  * If the outer relation is completely empty, we can quit without
 109                  * building the hash table.  However, for an inner join it is only a
 110                  * win to check this when the outer relation's startup cost is less
 111                  * than the projected cost of building the hash table.  Otherwise it's
 112                  * best to build the hash table first and see if the inner relation is
 113                  * empty.  (When it's an outer join, we should always make this check,
 114                  * since we aren't going to be able to skip the join on the strength
 115                  * of an empty inner relation anyway.)
 116                  *
 117                  * If we are rescanning the join, we make use of information gained on
 118                  * the previous scan: don't bother to try the prefetch if the previous
 119                  * scan found the outer relation nonempty.      This is not 100% reliable
 120                  * since with new parameters the outer relation might yield different
 121                  * results, but it's a good heuristic.
 122                  *
 123                  * The only way to make the check is to try to fetch a tuple from the
 124                  * outer plan node.  If we succeed, we have to stash it away for later
 125                  * consumption by ExecHashJoinOuterGetTuple.
 126                  */
 127                 if (HASHJOIN_IS_OUTER(node) ||
 128                         (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
 129                          !node->hj_OuterNotEmpty))
 130                 {
 131                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
 132                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
 133                         {
 134                                 node->hj_OuterNotEmpty = false;
 135                                 return NULL;
 136                         }
 137                         else
 138                                 node->hj_OuterNotEmpty = true;
 139                 }
 140                 else
 141                         node->hj_FirstOuterTupleSlot = NULL;
 142
 143                 /*
 144                  * create the hash table
 145                  */
 146                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
 147                                                                                 node->hj_HashOperators);
 148                 node->hj_HashTable = hashtable;
 149
 150                 /*
 151                  * execute the Hash node, to build the hash table
 152                  */
 153                 hashNode->hashtable = hashtable;
 154                 (void) MultiExecProcNode((PlanState *) hashNode);
 155
 156                 /*
 157                  * If the inner relation is completely empty, and we're not doing an
 158                  * outer join, we can quit without scanning the outer relation.
 159                  */
 160                 if (hashtable->totalTuples == 0 && !HASHJOIN_IS_OUTER(node))
 161                         return NULL;
 162
 163                 /*
 164                  * need to remember whether nbatch has increased since we began
 165                  * scanning the outer relation
 166                  */
 167                 hashtable->nbatch_outstart = hashtable->nbatch;
 168
 169                 /*
 170                  * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
 171                  * above, because ExecHashJoinOuterGetTuple will immediately set it
 172                  * again.)
 173                  */
 174                 node->hj_OuterNotEmpty = false;
 175         }
 176
 177         /*
 178          * run the hash join process
 179          */
 180         for (;;)
 181         {
 182                 /*
 183                  * If we don't have an outer tuple, get the next one
 184                  */
 185                 if (node->hj_NeedNewOuter)
 186                 {
 187                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
 188                                                                                                            node,
 189                                                                                                            &hashvalue);
 190                         if (TupIsNull(outerTupleSlot))
 191                         {
 192                                 /* end of join */
 193                                 return NULL;
 194                         }
 195
 196                         econtext->ecxt_outertuple = outerTupleSlot;
 197                         node->hj_NeedNewOuter = false;
 198                         node->hj_MatchedOuter = false;
 199
 200                         /*
 201                          * Now we have an outer tuple; find the corresponding bucket for
 202                          * this tuple in the main hash table or skew hash table.
 203                          */
 204                         node->hj_CurHashValue = hashvalue;
 205                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
 206                                                                           &node->hj_CurBucketNo, &batchno);
 207                         node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
 208                                                                                                                          hashvalue);
 209                         node->hj_CurTuple = NULL;
 210
 211                         /*
 212                          * Now we've got an outer tuple and the corresponding hash bucket,
 213                          * but it might not belong to the current batch, or it might
 214                          * match a skew bucket.
 215                          */
 216                         if (batchno != hashtable->curbatch &&
 217                                 node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
 218                         {
 219                                 /*
 220                                  * Need to postpone this outer tuple to a later batch. Save it
 221                                  * in the corresponding outer-batch file.
 222                                  */
 223                                 Assert(batchno > hashtable->curbatch);
 224                                 ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
 225                                                                           hashvalue,
 226                                                                           &hashtable->outerBatchFile[batchno]);
 227                                 node->hj_NeedNewOuter = true;
 228                                 continue;               /* loop around for a new outer tuple */
 229                         }
 230                 }
 231
 232                 /*
 233                  * OK, scan the selected hash bucket for matches
 234                  */
 235                 for (;;)
 236                 {
 237                         curtuple = ExecScanHashBucket(node, econtext);
 238                         if (curtuple == NULL)
 239                                 break;                  /* out of matches */
 240
 241                         /*
 242                          * we've got a match, but still need to test non-hashed quals
 243                          */
 244                         inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
 245                                                                                          node->hj_HashTupleSlot,
 246                                                                                          false);        /* don't pfree */
 247                         econtext->ecxt_innertuple = inntuple;
 248
 249                         /* reset temp memory each time to avoid leaks from qual expr */
 250                         ResetExprContext(econtext);
 251
 252                         /*
 253                          * if we pass the qual, then save state for next call and have
 254                          * ExecProject form the projection, store it in the tuple table,
 255                          * and return the slot.
 256                          *
 257                          * Only the joinquals determine MatchedOuter status, but all quals
 258                          * must pass to actually return the tuple.
 259                          */
 260                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
 261                         {
 262                                 node->hj_MatchedOuter = true;
 263
 264                                 /* In an antijoin, we never return a matched tuple */
 265                                 if (node->js.jointype == JOIN_ANTI)
 266                                 {
 267                                         node->hj_NeedNewOuter = true;
 268                                         break;          /* out of loop over hash bucket */
 269                                 }
 270
 271                                 /*
 272                                  * In a semijoin, we'll consider returning the first match,
 273                                  * but after that we're done with this outer tuple.
 274                                  */
 275                                 if (node->js.jointype == JOIN_SEMI)
 276                                         node->hj_NeedNewOuter = true;
 277
 278                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
 279                                 {
 280                                         TupleTableSlot *result;
 281
 282                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
 283
 284                                         if (isDone != ExprEndResult)
 285                                         {
 286                                                 node->js.ps.ps_TupFromTlist =
 287                                                         (isDone == ExprMultipleResult);
 288                                                 return result;
 289                                         }
 290                                 }
 291
 292                                 /*
 293                                  * If semijoin and we didn't return the tuple, we're still
 294                                  * done with this outer tuple.
 295                                  */
 296                                 if (node->js.jointype == JOIN_SEMI)
 297                                         break;          /* out of loop over hash bucket */
 298                         }
 299                 }
 300
 301                 /*
 302                  * Now the current outer tuple has run out of matches, so check
 303                  * whether to emit a dummy outer-join tuple. If not, loop around to
 304                  * get a new outer tuple.
 305                  */
 306                 node->hj_NeedNewOuter = true;
 307
 308                 if (!node->hj_MatchedOuter &&
 309                         HASHJOIN_IS_OUTER(node))
 310                 {
 311                         /*
 312                          * We are doing an outer join and there were no join matches for
 313                          * this outer tuple.  Generate a fake join tuple with nulls for
 314                          * the inner tuple, and return it if it passes the non-join quals.
 315                          */
 316                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
 317
 318                         if (otherqual == NIL || ExecQual(otherqual, econtext, false))
 319                         {
 320                                 /*
 321                                  * qualification was satisfied so we project and return the
 322                                  * slot containing the result tuple using ExecProject().
 323                                  */
 324                                 TupleTableSlot *result;
 325
 326                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
 327
 328                                 if (isDone != ExprEndResult)
 329                                 {
 330                                         node->js.ps.ps_TupFromTlist =
 331                                                 (isDone == ExprMultipleResult);
 332                                         return result;
 333                                 }
 334                         }
 335                 }
 336         }
 337 }
 338
 339 /* ----------------------------------------------------------------
 340  *              ExecInitHashJoin
 341  *
 342  *              Init routine for HashJoin node.
 343  * ----------------------------------------------------------------
 344  */
 345 HashJoinState *
 346 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 347 {
 348         HashJoinState *hjstate;
 349         Plan       *outerNode;
 350         Hash       *hashNode;
 351         List       *lclauses;
 352         List       *rclauses;
 353         List       *hoperators;
 354         ListCell   *l;
 355
 356         /* check for unsupported flags */
 357         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
 358
 359         /*
 360          * create state structure
 361          */
 362         hjstate = makeNode(HashJoinState);
 363         hjstate->js.ps.plan = (Plan *) node;
 364         hjstate->js.ps.state = estate;
 365
 366         /*
 367          * Miscellaneous initialization
 368          *
 369          * create expression context for node
 370          */
 371         ExecAssignExprContext(estate, &hjstate->js.ps);
 372
 373         /*
 374          * initialize child expressions
 375          */
 376         hjstate->js.ps.targetlist = (List *)
 377                 ExecInitExpr((Expr *) node->join.plan.targetlist,
 378                                          (PlanState *) hjstate);
 379         hjstate->js.ps.qual = (List *)
 380                 ExecInitExpr((Expr *) node->join.plan.qual,
 381                                          (PlanState *) hjstate);
 382         hjstate->js.jointype = node->join.jointype;
 383         hjstate->js.joinqual = (List *)
 384                 ExecInitExpr((Expr *) node->join.joinqual,
 385                                          (PlanState *) hjstate);
 386         hjstate->hashclauses = (List *)
 387                 ExecInitExpr((Expr *) node->hashclauses,
 388                                          (PlanState *) hjstate);
 389
 390         /*
 391          * initialize child nodes
 392          *
 393          * Note: we could suppress the REWIND flag for the inner input, which
 394          * would amount to betting that the hash will be a single batch.  Not
 395          * clear if this would be a win or not.
 396          */
 397         outerNode = outerPlan(node);
 398         hashNode = (Hash *) innerPlan(node);
 399
 400         outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
 401         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
 402
 403 #define HASHJOIN_NSLOTS 3
 404
 405         /*
 406          * tuple table initialization
 407          */
 408         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
 409         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
 410
 411         /* note: HASHJOIN_IS_OUTER macro depends on this initialization */
 412         switch (node->join.jointype)
 413         {
 414                 case JOIN_INNER:
 415                 case JOIN_SEMI:
 416                         break;
 417                 case JOIN_LEFT:
 418                 case JOIN_ANTI:
 419                         hjstate->hj_NullInnerTupleSlot =
 420                                 ExecInitNullTupleSlot(estate,
 421                                                                  ExecGetResultType(innerPlanState(hjstate)));
 422                         break;
 423                 default:
 424                         elog(ERROR, "unrecognized join type: %d",
 425                                  (int) node->join.jointype);
 426         }
 427
 428         /*
 429          * now for some voodoo.  our temporary tuple slot is actually the result
 430          * tuple slot of the Hash node (which is our inner plan).  we do this
 431          * because Hash nodes don't return tuples via ExecProcNode() -- instead
 432          * the hash join node uses ExecScanHashBucket() to get at the contents of
 433          * the hash table.      -cim 6/9/91
 434          */
 435         {
 436                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
 437                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
 438
 439                 hjstate->hj_HashTupleSlot = slot;
 440         }
 441
 442         /*
 443          * initialize tuple type and projection info
 444          */
 445         ExecAssignResultTypeFromTL(&hjstate->js.ps);
 446         ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
 447
 448         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
 449                                                   ExecGetResultType(outerPlanState(hjstate)));
 450
 451         /*
 452          * initialize hash-specific info
 453          */
 454         hjstate->hj_HashTable = NULL;
 455         hjstate->hj_FirstOuterTupleSlot = NULL;
 456
 457         hjstate->hj_CurHashValue = 0;
 458         hjstate->hj_CurBucketNo = 0;
 459         hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
 460         hjstate->hj_CurTuple = NULL;
 461
 462         /*
 463          * Deconstruct the hash clauses into outer and inner argument values, so
 464          * that we can evaluate those subexpressions separately.  Also make a list
 465          * of the hash operator OIDs, in preparation for looking up the hash
 466          * functions to use.
 467          */
 468         lclauses = NIL;
 469         rclauses = NIL;
 470         hoperators = NIL;
 471         foreach(l, hjstate->hashclauses)
 472         {
 473                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
 474                 OpExpr     *hclause;
 475
 476                 Assert(IsA(fstate, FuncExprState));
 477                 hclause = (OpExpr *) fstate->xprstate.expr;
 478                 Assert(IsA(hclause, OpExpr));
 479                 lclauses = lappend(lclauses, linitial(fstate->args));
 480                 rclauses = lappend(rclauses, lsecond(fstate->args));
 481                 hoperators = lappend_oid(hoperators, hclause->opno);
 482         }
 483         hjstate->hj_OuterHashKeys = lclauses;
 484         hjstate->hj_InnerHashKeys = rclauses;
 485         hjstate->hj_HashOperators = hoperators;
 486         /* child Hash node needs to evaluate inner hash keys, too */
 487         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
 488
 489         hjstate->js.ps.ps_TupFromTlist = false;
 490         hjstate->hj_NeedNewOuter = true;
 491         hjstate->hj_MatchedOuter = false;
 492         hjstate->hj_OuterNotEmpty = false;
 493
 494         return hjstate;
 495 }
 496
 497 int
 498 ExecCountSlotsHashJoin(HashJoin *node)
 499 {
 500         return ExecCountSlotsNode(outerPlan(node)) +
 501                 ExecCountSlotsNode(innerPlan(node)) +
 502                 HASHJOIN_NSLOTS;
 503 }
 504
 505 /* ----------------------------------------------------------------
 506  *              ExecEndHashJoin
 507  *
 508  *              clean up routine for HashJoin node
 509  * ----------------------------------------------------------------
 510  */
 511 void
 512 ExecEndHashJoin(HashJoinState *node)
 513 {
 514         /*
 515          * Free hash table
 516          */
 517         if (node->hj_HashTable)
 518         {
 519                 ExecHashTableDestroy(node->hj_HashTable);
 520                 node->hj_HashTable = NULL;
 521         }
 522
 523         /*
 524          * Free the exprcontext
 525          */
 526         ExecFreeExprContext(&node->js.ps);
 527
 528         /*
 529          * clean out the tuple table
 530          */
 531         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
 532         ExecClearTuple(node->hj_OuterTupleSlot);
 533         ExecClearTuple(node->hj_HashTupleSlot);
 534
 535         /*
 536          * clean up subtrees
 537          */
 538         ExecEndNode(outerPlanState(node));
 539         ExecEndNode(innerPlanState(node));
 540 }
 541
 542 /*
 543  * ExecHashJoinOuterGetTuple
 544  *
 545  *              get the next outer tuple for hashjoin: either by
 546  *              executing a plan node in the first pass, or from
 547  *              the temp files for the hashjoin batches.
 548  *
 549  * Returns a null slot if no more outer tuples.  On success, the tuple's
 550  * hash value is stored at *hashvalue --- this is either originally computed,
 551  * or re-read from the temp file.
 552  */
 553 static TupleTableSlot *
 554 ExecHashJoinOuterGetTuple(PlanState *outerNode,
 555                                                   HashJoinState *hjstate,
 556                                                   uint32 *hashvalue)
 557 {
 558         HashJoinTable hashtable = hjstate->hj_HashTable;
 559         int                     curbatch = hashtable->curbatch;
 560         TupleTableSlot *slot;
 561
 562         if (curbatch == 0)                      /* if it is the first pass */
 563         {
 564                 /*
 565                  * Check to see if first outer tuple was already fetched by
 566                  * ExecHashJoin() and not used yet.
 567                  */
 568                 slot = hjstate->hj_FirstOuterTupleSlot;
 569                 if (!TupIsNull(slot))
 570                         hjstate->hj_FirstOuterTupleSlot = NULL;
 571                 else
 572                         slot = ExecProcNode(outerNode);
 573
 574                 while (!TupIsNull(slot))
 575                 {
 576                         /*
 577                          * We have to compute the tuple's hash value.
 578                          */
 579                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
 580
 581                         econtext->ecxt_outertuple = slot;
 582                         if (ExecHashGetHashValue(hashtable, econtext,
 583                                                                          hjstate->hj_OuterHashKeys,
 584                                                                          true,          /* outer tuple */
 585                                                                          HASHJOIN_IS_OUTER(hjstate),
 586                                                                          hashvalue))
 587                         {
 588                                 /* remember outer relation is not empty for possible rescan */
 589                                 hjstate->hj_OuterNotEmpty = true;
 590
 591                                 return slot;
 592                         }
 593
 594                         /*
 595                          * That tuple couldn't match because of a NULL, so discard it and
 596                          * continue with the next one.
 597                          */
 598                         slot = ExecProcNode(outerNode);
 599                 }
 600
 601                 /*
 602                  * We have just reached the end of the first pass. Try to switch to a
 603                  * saved batch.
 604                  */
 605                 curbatch = ExecHashJoinNewBatch(hjstate);
 606         }
 607
 608         /*
 609          * Try to read from a temp file. Loop allows us to advance to new batches
 610          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
 611          * don't try to optimize this loop.
 612          */
 613         while (curbatch < hashtable->nbatch)
 614         {
 615                 slot = ExecHashJoinGetSavedTuple(hjstate,
 616                                                                                  hashtable->outerBatchFile[curbatch],
 617                                                                                  hashvalue,
 618                                                                                  hjstate->hj_OuterTupleSlot);
 619                 if (!TupIsNull(slot))
 620                         return slot;
 621                 curbatch = ExecHashJoinNewBatch(hjstate);
 622         }
 623
 624         /* Out of batches... */
 625         return NULL;
 626 }
 627
 628 /*
 629  * ExecHashJoinNewBatch
 630  *              switch to a new hashjoin batch
 631  *
 632  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
 633  * We will never return a batch number that has an empty outer batch file.
 634  */
 635 static int
 636 ExecHashJoinNewBatch(HashJoinState *hjstate)
 637 {
 638         HashJoinTable hashtable = hjstate->hj_HashTable;
 639         int                     nbatch;
 640         int                     curbatch;
 641         BufFile    *innerFile;
 642         TupleTableSlot *slot;
 643         uint32          hashvalue;
 644
 645 start_over:
 646         nbatch = hashtable->nbatch;
 647         curbatch = hashtable->curbatch;
 648
 649         if (curbatch > 0)
 650         {
 651                 /*
 652                  * We no longer need the previous outer batch file; close it right
 653                  * away to free disk space.
 654                  */
 655                 if (hashtable->outerBatchFile[curbatch])
 656                         BufFileClose(hashtable->outerBatchFile[curbatch]);
 657                 hashtable->outerBatchFile[curbatch] = NULL;
 658         }
 659         else                                            /* we just finished the first batch */
 660         {
 661                 /*
 662                  * Reset some of the skew optimization state variables, since we
 663                  * no longer need to consider skew tuples after the first batch.
 664                  * The memory context reset we are about to do will release the
 665                  * skew hashtable itself.
 666                  */
 667                 hashtable->skewEnabled = false;
 668                 hashtable->skewBucket = NULL;
 669                 hashtable->skewBucketNums = NULL;
 670                 hashtable->spaceUsedSkew = 0;
 671         }
 672
 673         /*
 674          * We can always skip over any batches that are completely empty on both
 675          * sides.  We can sometimes skip over batches that are empty on only one
 676          * side, but there are exceptions:
 677          *
 678          * 1. In an outer join, we have to process outer batches even if the inner
 679          * batch is empty.
 680          *
 681          * 2. If we have increased nbatch since the initial estimate, we have to
 682          * scan inner batches since they might contain tuples that need to be
 683          * reassigned to later inner batches.
 684          *
 685          * 3. Similarly, if we have increased nbatch since starting the outer
 686          * scan, we have to rescan outer batches in case they contain tuples that
 687          * need to be reassigned.
 688          */
 689         curbatch++;
 690         while (curbatch < nbatch &&
 691                    (hashtable->outerBatchFile[curbatch] == NULL ||
 692                         hashtable->innerBatchFile[curbatch] == NULL))
 693         {
 694                 if (hashtable->outerBatchFile[curbatch] &&
 695                         HASHJOIN_IS_OUTER(hjstate))
 696                         break;                          /* must process due to rule 1 */
 697                 if (hashtable->innerBatchFile[curbatch] &&
 698                         nbatch != hashtable->nbatch_original)
 699                         break;                          /* must process due to rule 2 */
 700                 if (hashtable->outerBatchFile[curbatch] &&
 701                         nbatch != hashtable->nbatch_outstart)
 702                         break;                          /* must process due to rule 3 */
 703                 /* We can ignore this batch. */
 704                 /* Release associated temp files right away. */
 705                 if (hashtable->innerBatchFile[curbatch])
 706                         BufFileClose(hashtable->innerBatchFile[curbatch]);
 707                 hashtable->innerBatchFile[curbatch] = NULL;
 708                 if (hashtable->outerBatchFile[curbatch])
 709                         BufFileClose(hashtable->outerBatchFile[curbatch]);
 710                 hashtable->outerBatchFile[curbatch] = NULL;
 711                 curbatch++;
 712         }
 713
 714         if (curbatch >= nbatch)
 715                 return curbatch;                /* no more batches */
 716
 717         hashtable->curbatch = curbatch;
 718
 719         /*
 720          * Reload the hash table with the new inner batch (which could be empty)
 721          */
 722         ExecHashTableReset(hashtable);
 723
 724         innerFile = hashtable->innerBatchFile[curbatch];
 725
 726         if (innerFile != NULL)
 727         {
 728                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
 729                         ereport(ERROR,
 730                                         (errcode_for_file_access(),
 731                                    errmsg("could not rewind hash-join temporary file: %m")));
 732
 733                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
 734                                                                                                  innerFile,
 735                                                                                                  &hashvalue,
 736                                                                                                  hjstate->hj_HashTupleSlot)))
 737                 {
 738                         /*
 739                          * NOTE: some tuples may be sent to future batches.  Also, it is
 740                          * possible for hashtable->nbatch to be increased here!
 741                          */
 742                         ExecHashTableInsert(hashtable, slot, hashvalue);
 743                 }
 744
 745                 /*
 746                  * after we build the hash table, the inner batch file is no longer
 747                  * needed
 748                  */
 749                 BufFileClose(innerFile);
 750                 hashtable->innerBatchFile[curbatch] = NULL;
 751         }
 752
 753         /*
 754          * If there's no outer batch file, advance to next batch.
 755          */
 756         if (hashtable->outerBatchFile[curbatch] == NULL)
 757                 goto start_over;
 758
 759         /*
 760          * Rewind outer batch file, so that we can start reading it.
 761          */
 762         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
 763                 ereport(ERROR,
 764                                 (errcode_for_file_access(),
 765                                  errmsg("could not rewind hash-join temporary file: %m")));
 766
 767         return curbatch;
 768 }
 769
 770 /*
 771  * ExecHashJoinSaveTuple
 772  *              save a tuple to a batch file.
 773  *
 774  * The data recorded in the file for each tuple is its hash value,
 775  * then the tuple in MinimalTuple format.
 776  *
 777  * Note: it is important always to call this in the regular executor
 778  * context, not in a shorter-lived context; else the temp file buffers
 779  * will get messed up.
 780  */
 781 void
 782 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
 783                                           BufFile **fileptr)
 784 {
 785         BufFile    *file = *fileptr;
 786         size_t          written;
 787
 788         if (file == NULL)
 789         {
 790                 /* First write to this batch file, so open it. */
 791                 file = BufFileCreateTemp(false);
 792                 *fileptr = file;
 793         }
 794
 795         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
 796         if (written != sizeof(uint32))
 797                 ereport(ERROR,
 798                                 (errcode_for_file_access(),
 799                                  errmsg("could not write to hash-join temporary file: %m")));
 800
 801         written = BufFileWrite(file, (void *) tuple, tuple->t_len);
 802         if (written != tuple->t_len)
 803                 ereport(ERROR,
 804                                 (errcode_for_file_access(),
 805                                  errmsg("could not write to hash-join temporary file: %m")));
 806 }
 807
 808 /*
 809  * ExecHashJoinGetSavedTuple
 810  *              read the next tuple from a batch file.  Return NULL if no more.
 811  *
 812  * On success, *hashvalue is set to the tuple's hash value, and the tuple
 813  * itself is stored in the given slot.
 814  */
 815 static TupleTableSlot *
 816 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 817                                                   BufFile *file,
 818                                                   uint32 *hashvalue,
 819                                                   TupleTableSlot *tupleSlot)
 820 {
 821         uint32          header[2];
 822         size_t          nread;
 823         MinimalTuple tuple;
 824
 825         /*
 826          * Since both the hash value and the MinimalTuple length word are uint32,
 827          * we can read them both in one BufFileRead() call without any type
 828          * cheating.
 829          */
 830         nread = BufFileRead(file, (void *) header, sizeof(header));
 831         if (nread == 0)                         /* end of file */
 832         {
 833                 ExecClearTuple(tupleSlot);
 834                 return NULL;
 835         }
 836         if (nread != sizeof(header))
 837                 ereport(ERROR,
 838                                 (errcode_for_file_access(),
 839                                  errmsg("could not read from hash-join temporary file: %m")));
 840         *hashvalue = header[0];
 841         tuple = (MinimalTuple) palloc(header[1]);
 842         tuple->t_len = header[1];
 843         nread = BufFileRead(file,
 844                                                 (void *) ((char *) tuple + sizeof(uint32)),
 845                                                 header[1] - sizeof(uint32));
 846         if (nread != header[1] - sizeof(uint32))
 847                 ereport(ERROR,
 848                                 (errcode_for_file_access(),
 849                                  errmsg("could not read from hash-join temporary file: %m")));
 850         return ExecStoreMinimalTuple(tuple, tupleSlot, true);
 851 }
 852
 853
 854 void
 855 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
 856 {
 857         /*
 858          * In a multi-batch join, we currently have to do rescans the hard way,
 859          * primarily because batch temp files may have already been released. But
 860          * if it's a single-batch join, and there is no parameter change for the
 861          * inner subnode, then we can just re-use the existing hash table without
 862          * rebuilding it.
 863          */
 864         if (node->hj_HashTable != NULL)
 865         {
 866                 if (node->hj_HashTable->nbatch == 1 &&
 867                         ((PlanState *) node)->righttree->chgParam == NULL)
 868                 {
 869                         /*
 870                          * okay to reuse the hash table; needn't rescan inner, either.
 871                          *
 872                          * What we do need to do is reset our state about the emptiness of
 873                          * the outer relation, so that the new scan of the outer will
 874                          * update it correctly if it turns out to be empty this time.
 875                          * (There's no harm in clearing it now because ExecHashJoin won't
 876                          * need the info.  In the other cases, where the hash table
 877                          * doesn't exist or we are destroying it, we leave this state
 878                          * alone because ExecHashJoin will need it the first time
 879                          * through.)
 880                          */
 881                         node->hj_OuterNotEmpty = false;
 882                 }
 883                 else
 884                 {
 885                         /* must destroy and rebuild hash table */
 886                         ExecHashTableDestroy(node->hj_HashTable);
 887                         node->hj_HashTable = NULL;
 888
 889                         /*
 890                          * if chgParam of subnode is not null then plan will be re-scanned
 891                          * by first ExecProcNode.
 892                          */
 893                         if (((PlanState *) node)->righttree->chgParam == NULL)
 894                                 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
 895                 }
 896         }
 897
 898         /* Always reset intra-tuple state */
 899         node->hj_CurHashValue = 0;
 900         node->hj_CurBucketNo = 0;
 901         node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
 902         node->hj_CurTuple = NULL;
 903
 904         node->js.ps.ps_TupFromTlist = false;
 905         node->hj_NeedNewOuter = true;
 906         node->hj_MatchedOuter = false;
 907         node->hj_FirstOuterTupleSlot = NULL;
 908
 909         /*
 910          * if chgParam of subnode is not null then plan will be re-scanned by
 911          * first ExecProcNode.
 912          */
 913         if (((PlanState *) node)->lefttree->chgParam == NULL)
 914                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
 915 }