From 849074f9ae422c64501bb1d53ef840de870bf65c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 6 Mar 2005 22:15:05 +0000 Subject: [PATCH] Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). --- src/backend/executor/nodeHash.c | 511 ++++++++++++++++---------- src/backend/executor/nodeHashjoin.c | 393 +++++++++++--------- src/backend/optimizer/path/costsize.c | 18 +- src/backend/utils/adt/selfuncs.c | 8 +- src/include/executor/hashjoin.h | 75 ++-- src/include/executor/nodeHash.h | 26 +- src/include/executor/nodeHashjoin.h | 8 +- src/include/nodes/execnodes.h | 17 +- src/include/utils/selfuncs.h | 4 +- 9 files changed, 632 insertions(+), 428 deletions(-) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 73cc440bb6..c85755890a 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.88 2004/12/31 21:59:45 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.89 2005/03/06 22:15:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,6 +21,7 @@ #include "postgres.h" #include "executor/execdebug.h" +#include "executor/hashjoin.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" #include "miscadmin.h" @@ -29,6 +30,9 @@ #include "utils/lsyscache.h" +static void ExecHashIncreaseNumBatches(HashJoinTable hashtable); + + /* ---------------------------------------------------------------- * ExecHash * @@ -39,33 +43,19 @@ TupleTableSlot * ExecHash(HashState *node) { - EState *estate; PlanState *outerNode; List *hashkeys; HashJoinTable hashtable; TupleTableSlot *slot; ExprContext *econtext; - int nbatch; - int i; + uint32 hashvalue; /* * get state info from node */ - estate = node->ps.state; outerNode = outerPlanState(node); hashtable = node->hashtable; - nbatch = hashtable->nbatch; - - if (nbatch > 0) - { - /* - * Open temp files for inner batches, if needed. Note that file - * buffers are palloc'd in regular executor context. - */ - for (i = 0; i < nbatch; i++) - hashtable->innerBatchFile[i] = BufFileCreateTemp(false); - } /* * set expression context @@ -82,14 +72,15 @@ ExecHash(HashState *node) if (TupIsNull(slot)) break; hashtable->hashNonEmpty = true; + /* We have to compute the hash value */ econtext->ecxt_innertuple = slot; - ExecHashTableInsert(hashtable, econtext, hashkeys); - ExecClearTuple(slot); + hashvalue = ExecHashGetHashValue(hashtable, econtext, hashkeys); + ExecHashTableInsert(hashtable, slot->val, hashvalue); } /* * Return the slot so that we have the tuple descriptor when we need - * to save/restore them. -Jeff 11 July 1991 + * to save/restore them. -Jeff 11 July 1991 (XXX isn't this dead code?) 
*/ return slot; } @@ -198,7 +189,6 @@ ExecHashTableCreate(Hash *node, List *hashOperators) { HashJoinTable hashtable; Plan *outerNode; - int totalbuckets; int nbuckets; int nbatch; int nkeys; @@ -214,11 +204,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators) outerNode = outerPlan(node); ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width, - &totalbuckets, &nbuckets, &nbatch); + &nbuckets, &nbatch); #ifdef HJDEBUG - printf("nbatch = %d, totalbuckets = %d, nbuckets = %d\n", - nbatch, totalbuckets, nbuckets); + printf("nbatch = %d, nbuckets = %d\n", nbatch, nbuckets); #endif /* @@ -229,15 +218,17 @@ ExecHashTableCreate(Hash *node, List *hashOperators) */ hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData)); hashtable->nbuckets = nbuckets; - hashtable->totalbuckets = totalbuckets; hashtable->buckets = NULL; hashtable->nbatch = nbatch; hashtable->curbatch = 0; + hashtable->nbatch_original = nbatch; + hashtable->nbatch_outstart = nbatch; + hashtable->growEnabled = true; hashtable->hashNonEmpty = false; hashtable->innerBatchFile = NULL; hashtable->outerBatchFile = NULL; - hashtable->innerBatchSize = NULL; - hashtable->outerBatchSize = NULL; + hashtable->spaceUsed = 0; + hashtable->spaceAllowed = work_mem * 1024L; /* * Get info about the hash functions to be used for each hash key. @@ -277,7 +268,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators) oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); - if (nbatch > 0) + if (nbatch > 1) { /* * allocate and initialize the file arrays in hashCxt @@ -286,11 +277,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators) palloc0(nbatch * sizeof(BufFile *)); hashtable->outerBatchFile = (BufFile **) palloc0(nbatch * sizeof(BufFile *)); - hashtable->innerBatchSize = (long *) - palloc0(nbatch * sizeof(long)); - hashtable->outerBatchSize = (long *) - palloc0(nbatch * sizeof(long)); - /* The files will not be opened until later... */ + /* The files will not be opened until needed... */ } /* @@ -312,42 +299,44 @@ ExecHashTableCreate(Hash *node, List *hashOperators) * Compute appropriate size for hashtable given the estimated size of the * relation to be hashed (number of rows and average row width). * - * Caution: the input is only the planner's estimates, and so can't be - * trusted too far. Apply a healthy fudge factor. - * * This is exported so that the planner's costsize.c can use it. */ /* Target bucket loading (tuples per bucket) */ #define NTUP_PER_BUCKET 10 -/* Fudge factor to allow for inaccuracy of input estimates */ -#define FUDGE_FAC 2.0 + +/* Prime numbers that we like to use as nbuckets values */ +static const int hprimes[] = { + 1033, 2063, 4111, 8219, 16417, 32779, 65539, 131111, + 262151, 524341, 1048589, 2097211, 4194329, 8388619, 16777289, 33554473, + 67108913, 134217773, 268435463, 536870951, 1073741831 +}; void ExecChooseHashTableSize(double ntuples, int tupwidth, - int *virtualbuckets, - int *physicalbuckets, + int *numbuckets, int *numbatches) { int tupsize; double inner_rel_bytes; long hash_table_bytes; - double dtmp; int nbatch; int nbuckets; - int totalbuckets; - int bucketsize; + int i; /* Force a plausible relation size if no info */ if (ntuples <= 0.0) ntuples = 1000.0; /* - * Estimate tupsize based on footprint of tuple in hashtable... but - * what about palloc overhead? + * Estimate tupsize based on footprint of tuple in hashtable... note + * this does not allow for any palloc overhead. The manipulations of + * spaceUsed don't count palloc overhead either. 
*/ - tupsize = MAXALIGN(tupwidth) + MAXALIGN(sizeof(HashJoinTupleData)); - inner_rel_bytes = ntuples * tupsize * FUDGE_FAC; + tupsize = MAXALIGN(sizeof(HashJoinTupleData)) + + MAXALIGN(sizeof(HeapTupleHeaderData)) + + MAXALIGN(tupwidth); + inner_rel_bytes = ntuples * tupsize; /* * Target in-memory hashtable size is work_mem kilobytes. @@ -355,73 +344,57 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, hash_table_bytes = work_mem * 1024L; /* - * Count the number of hash buckets we want for the whole relation, - * for an average bucket load of NTUP_PER_BUCKET (per virtual - * bucket!). It has to fit in an int, however. - */ - dtmp = ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET); - if (dtmp < INT_MAX) - totalbuckets = (int) dtmp; - else - totalbuckets = INT_MAX; - if (totalbuckets <= 0) - totalbuckets = 1; - - /* - * Count the number of buckets we think will actually fit in the - * target memory size, at a loading of NTUP_PER_BUCKET (physical - * buckets). NOTE: FUDGE_FAC here determines the fraction of the - * hashtable space reserved to allow for nonuniform distribution of - * hash values. Perhaps this should be a different number from the - * other uses of FUDGE_FAC, but since we have no real good way to pick - * either one... + * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when + * memory is filled. Set nbatch to the smallest power of 2 that appears + * sufficient. */ - bucketsize = NTUP_PER_BUCKET * tupsize; - nbuckets = (int) (hash_table_bytes / (bucketsize * FUDGE_FAC)); - if (nbuckets <= 0) - nbuckets = 1; - - if (totalbuckets <= nbuckets) + if (inner_rel_bytes > hash_table_bytes) { - /* - * We have enough space, so no batching. In theory we could even - * reduce nbuckets, but since that could lead to poor behavior if - * estimated ntuples is much less than reality, it seems better to - * make more buckets instead of fewer. - */ - totalbuckets = nbuckets; - nbatch = 0; + /* We'll need multiple batches */ + long lbuckets; + double dbatch; + int minbatch; + + lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET; + lbuckets = Min(lbuckets, INT_MAX); + nbuckets = (int) lbuckets; + + dbatch = ceil(inner_rel_bytes / hash_table_bytes); + dbatch = Min(dbatch, INT_MAX/2); + minbatch = (int) dbatch; + nbatch = 2; + while (nbatch < minbatch) + nbatch <<= 1; } else { - /* - * Need to batch; compute how many batches we want to use. Note - * that nbatch doesn't have to have anything to do with the ratio - * totalbuckets/nbuckets; in fact, it is the number of groups we - * will use for the part of the data that doesn't fall into the - * first nbuckets hash buckets. We try to set it to make all the - * batches the same size. - */ - dtmp = ceil((inner_rel_bytes - hash_table_bytes) / - hash_table_bytes); - if (dtmp < INT_MAX) - nbatch = (int) dtmp; - else - nbatch = INT_MAX; - if (nbatch <= 0) - nbatch = 1; + /* We expect the hashtable to fit in memory */ + double dbuckets; + + dbuckets = ceil(ntuples / NTUP_PER_BUCKET); + dbuckets = Min(dbuckets, INT_MAX); + nbuckets = (int) dbuckets; + + nbatch = 1; } /* - * Now, totalbuckets is the number of (virtual) hashbuckets for the - * whole relation, and nbuckets is the number of physical hashbuckets - * we will use in the first pass. Data falling into the first - * nbuckets virtual hashbuckets gets handled in the first pass; - * everything else gets divided into nbatch batches to be processed in - * additional passes. 
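+	 * As a worked example of the sizing logic above (illustrative
+	 * numbers, not part of the original comment): with work_mem = 1024
+	 * so that hash_table_bytes = 1048576, tupsize = 144, and ntuples =
+	 * 100000, inner_rel_bytes = 14400000 exceeds hash_table_bytes, so
+	 * we take the multi-batch path: lbuckets = (1048576 / 144) / 10 =
+	 * 728, and dbatch = ceil(14400000 / 1048576) = 14, which rounds up
+	 * to the power of 2 nbatch = 16.  The prime-selection step below
+	 * then raises nbuckets from 728 to 1033.
+	 *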
+ * We want nbuckets to be prime so as to avoid having bucket and batch + * numbers depend on only some bits of the hash code. Choose the next + * larger prime from the list in hprimes[]. (This also enforces that + * nbuckets is not very small, by the simple expedient of not putting + * any very small entries in hprimes[].) */ - *virtualbuckets = totalbuckets; - *physicalbuckets = nbuckets; + for (i = 0; i < (int) lengthof(hprimes); i++) + { + if (hprimes[i] >= nbuckets) + { + nbuckets = hprimes[i]; + break; + } + } + + *numbuckets = nbuckets; *numbatches = nbatch; } @@ -437,8 +410,12 @@ ExecHashTableDestroy(HashJoinTable hashtable) { int i; - /* Make sure all the temp files are closed */ - for (i = 0; i < hashtable->nbatch; i++) + /* + * Make sure all the temp files are closed. We skip batch 0, since it + * can't have any temp files (and the arrays might not even exist if + * nbatch is only 1). + */ + for (i = 1; i < hashtable->nbatch; i++) { if (hashtable->innerBatchFile[i]) BufFileClose(hashtable->innerBatchFile[i]); @@ -453,27 +430,159 @@ ExecHashTableDestroy(HashJoinTable hashtable) pfree(hashtable); } -/* ---------------------------------------------------------------- - * ExecHashTableInsert - * +/* + * ExecHashIncreaseNumBatches + * increase the original number of batches in order to reduce + * current memory consumption + */ +static void +ExecHashIncreaseNumBatches(HashJoinTable hashtable) +{ + int oldnbatch = hashtable->nbatch; + int curbatch = hashtable->curbatch; + int nbatch; + int i; + MemoryContext oldcxt; + long ninmemory; + long nfreed; + + /* do nothing if we've decided to shut off growth */ + if (!hashtable->growEnabled) + return; + + /* safety check to avoid overflow */ + if (oldnbatch > INT_MAX/2) + return; + + nbatch = oldnbatch * 2; + Assert(nbatch > 1); + +#ifdef HJDEBUG + printf("Increasing nbatch to %d because space = %lu\n", + nbatch, (unsigned long) hashtable->spaceUsed); +#endif + + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + if (hashtable->innerBatchFile == NULL) + { + /* we had no file arrays before */ + hashtable->innerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + } + else + { + /* enlarge arrays and zero out added entries */ + hashtable->innerBatchFile = (BufFile **) + repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *)); + MemSet(hashtable->innerBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); + MemSet(hashtable->outerBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); + } + + MemoryContextSwitchTo(oldcxt); + + hashtable->nbatch = nbatch; + + /* + * Scan through the existing hash table entries and dump out any + * that are no longer of the current batch. 
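+	 * (Because nbatch has just doubled, the batch computation
+	 * batchno = (hashvalue / nbuckets) % nbatch gains one high-order
+	 * bit, so each tuple either keeps its old batch number or moves
+	 * forward to old batchno + oldnbatch; nothing can move to an
+	 * earlier batch.)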
+ */ + ninmemory = nfreed = 0; + + for (i = 0; i < hashtable->nbuckets; i++) + { + HashJoinTuple prevtuple; + HashJoinTuple tuple; + + prevtuple = NULL; + tuple = hashtable->buckets[i]; + + while (tuple != NULL) + { + /* save link in case we delete */ + HashJoinTuple nexttuple = tuple->next; + int bucketno; + int batchno; + + ninmemory++; + ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue, + &bucketno, &batchno); + Assert(bucketno == i); + if (batchno == curbatch) + { + /* keep tuple */ + prevtuple = tuple; + } + else + { + /* dump it out */ + Assert(batchno > curbatch); + ExecHashJoinSaveTuple(&tuple->htup, tuple->hashvalue, + &hashtable->innerBatchFile[batchno]); + /* and remove from hash table */ + if (prevtuple) + prevtuple->next = nexttuple; + else + hashtable->buckets[i] = nexttuple; + /* prevtuple doesn't change */ + hashtable->spaceUsed -= + MAXALIGN(sizeof(HashJoinTupleData)) + tuple->htup.t_len; + pfree(tuple); + nfreed++; + } + + tuple = nexttuple; + } + } + +#ifdef HJDEBUG + printf("Freed %ld of %ld tuples, space now %lu\n", + nfreed, ninmemory, (unsigned long) hashtable->spaceUsed); +#endif + + /* + * If we dumped out either all or none of the tuples in the table, + * disable further expansion of nbatch. This situation implies that + * we have enough tuples of identical hashvalues to overflow spaceAllowed. + * Increasing nbatch will not fix it since there's no way to subdivide + * the group any more finely. + * We have to just gut it out and hope the server has enough RAM. + */ + if (nfreed == 0 || nfreed == ninmemory) + { + hashtable->growEnabled = false; +#ifdef HJDEBUG + printf("Disabling further increase of nbatch\n"); +#endif + } +} + +/* + * ExecHashTableInsert * insert a tuple into the hash table depending on the hash value - * it may just go to a tmp file for other batches - * ---------------------------------------------------------------- + * it may just go to a temp file for later batches */ void ExecHashTableInsert(HashJoinTable hashtable, - ExprContext *econtext, - List *hashkeys) + HeapTuple tuple, + uint32 hashvalue) { - int bucketno = ExecHashGetBucket(hashtable, econtext, hashkeys); - int batchno = ExecHashGetBatch(bucketno, hashtable); - TupleTableSlot *slot = econtext->ecxt_innertuple; - HeapTuple heapTuple = slot->val; + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &bucketno, &batchno); /* - * decide whether to put the tuple in the hash table or a tmp file + * decide whether to put the tuple in the hash table or a temp file */ - if (batchno < 0) + if (batchno == hashtable->curbatch) { /* * put the tuple in hash table @@ -481,45 +590,50 @@ ExecHashTableInsert(HashJoinTable hashtable, HashJoinTuple hashTuple; int hashTupleSize; - hashTupleSize = MAXALIGN(sizeof(*hashTuple)) + heapTuple->t_len; + hashTupleSize = MAXALIGN(sizeof(HashJoinTupleData)) + tuple->t_len; hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, hashTupleSize); + hashTuple->hashvalue = hashvalue; memcpy((char *) &hashTuple->htup, - (char *) heapTuple, + (char *) tuple, sizeof(hashTuple->htup)); hashTuple->htup.t_datamcxt = hashtable->batchCxt; hashTuple->htup.t_data = (HeapTupleHeader) - (((char *) hashTuple) + MAXALIGN(sizeof(*hashTuple))); + (((char *) hashTuple) + MAXALIGN(sizeof(HashJoinTupleData))); memcpy((char *) hashTuple->htup.t_data, - (char *) heapTuple->t_data, - heapTuple->t_len); + (char *) tuple->t_data, + tuple->t_len); hashTuple->next = hashtable->buckets[bucketno]; hashtable->buckets[bucketno] = hashTuple; + 
hashtable->spaceUsed += hashTupleSize; + if (hashtable->spaceUsed > hashtable->spaceAllowed) + ExecHashIncreaseNumBatches(hashtable); } else { /* - * put the tuple into a tmp file for later batches + * put the tuple into a temp file for later batches */ - hashtable->innerBatchSize[batchno]++; - ExecHashJoinSaveTuple(heapTuple, - hashtable->innerBatchFile[batchno]); + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, hashvalue, + &hashtable->innerBatchFile[batchno]); } } -/* ---------------------------------------------------------------- - * ExecHashGetBucket +/* + * ExecHashGetHashValue + * Compute the hash value for a tuple * - * Get the hash value for a tuple - * ---------------------------------------------------------------- + * The tuple to be tested must be in either econtext->ecxt_outertuple or + * econtext->ecxt_innertuple. Vars in the hashkeys expressions reference + * either OUTER or INNER. */ -int -ExecHashGetBucket(HashJoinTable hashtable, - ExprContext *econtext, - List *hashkeys) +uint32 +ExecHashGetHashValue(HashJoinTable hashtable, + ExprContext *econtext, + List *hashkeys) { uint32 hashkey = 0; - int bucketno; ListCell *hk; int i = 0; MemoryContext oldContext; @@ -561,51 +675,63 @@ ExecHashGetBucket(HashJoinTable hashtable, i++; } - bucketno = hashkey % (uint32) hashtable->totalbuckets; - -#ifdef HJDEBUG - if (bucketno >= hashtable->nbuckets) - printf("hash(%u) = %d SAVED\n", hashkey, bucketno); - else - printf("hash(%u) = %d\n", hashkey, bucketno); -#endif - MemoryContextSwitchTo(oldContext); - return bucketno; + return hashkey; } -/* ---------------------------------------------------------------- - * ExecHashGetBatch +/* + * ExecHashGetBucketAndBatch + * Determine the bucket number and batch number for a hash value * - * determine the batch number for a bucketno + * Note: on-the-fly increases of nbatch must not change the bucket number + * for a given hash code (since we don't move tuples to different hash + * chains), and must only cause the batch number to remain the same or + * increase. Our algorithm is + * bucketno = hashvalue MOD nbuckets + * batchno = (hashvalue DIV nbuckets) MOD nbatch + * where nbuckets should preferably be prime so that all bits of the + * hash value can affect both bucketno and batchno. + * nbuckets doesn't change over the course of the join. * - * Returns -1 if bucket belongs to initial (or current) batch, - * else 0..nbatch-1 corresponding to external batch file number for bucket. - * ---------------------------------------------------------------- + * nbatch is always a power of 2; we increase it only by doubling it. This + * effectively adds one more bit to the top of the batchno. 
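+ *
+ * For example (numbers invented for illustration): with nbuckets = 1033
+ * and nbatch = 4, hashvalue = 123456789 gives bucketno = 123456789 MOD
+ * 1033 = 893 and batchno = (123456789 DIV 1033) MOD 4 = 119512 MOD 4 = 0.
+ * After nbatch doubles to 8, the same tuple still lands in bucket 893 and
+ * batch 119512 MOD 8 = 0, while a tuple with hashvalue DIV nbuckets = 6
+ * would move from batch 2 to batch 6 --- later, never earlier.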
*/ -int -ExecHashGetBatch(int bucketno, HashJoinTable hashtable) +void +ExecHashGetBucketAndBatch(HashJoinTable hashtable, + uint32 hashvalue, + int *bucketno, + int *batchno) { - if (bucketno < hashtable->nbuckets) - return -1; + uint32 nbuckets = (uint32) hashtable->nbuckets; + uint32 nbatch = (uint32) hashtable->nbatch; - return (bucketno - hashtable->nbuckets) % hashtable->nbatch; + if (nbatch > 1) + { + *bucketno = hashvalue % nbuckets; + *batchno = (hashvalue / nbuckets) % nbatch; + } + else + { + *bucketno = hashvalue % nbuckets; + *batchno = 0; + } } -/* ---------------------------------------------------------------- - * ExecScanHashBucket +/* + * ExecScanHashBucket + * scan a hash bucket for matches to the current outer tuple * - * scan a hash bucket of matches - * ---------------------------------------------------------------- + * The current outer tuple must be stored in econtext->ecxt_outertuple. */ HeapTuple ExecScanHashBucket(HashJoinState *hjstate, - List *hjclauses, ExprContext *econtext) { + List *hjclauses = hjstate->hashclauses; HashJoinTable hashtable = hjstate->hj_HashTable; HashJoinTuple hashTuple = hjstate->hj_CurTuple; + uint32 hashvalue = hjstate->hj_CurHashValue; /* * hj_CurTuple is NULL to start scanning a new bucket, or the address @@ -618,23 +744,26 @@ ExecScanHashBucket(HashJoinState *hjstate, while (hashTuple != NULL) { - HeapTuple heapTuple = &hashTuple->htup; - TupleTableSlot *inntuple; - - /* insert hashtable's tuple into exec slot so ExecQual sees it */ - inntuple = ExecStoreTuple(heapTuple, /* tuple to store */ - hjstate->hj_HashTupleSlot, /* slot */ - InvalidBuffer, - false); /* do not pfree this tuple */ - econtext->ecxt_innertuple = inntuple; - - /* reset temp memory each time to avoid leaks from qual expression */ - ResetExprContext(econtext); - - if (ExecQual(hjclauses, econtext, false)) + if (hashTuple->hashvalue == hashvalue) { - hjstate->hj_CurTuple = hashTuple; - return heapTuple; + HeapTuple heapTuple = &hashTuple->htup; + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot so ExecQual sees it */ + inntuple = ExecStoreTuple(heapTuple, + hjstate->hj_HashTupleSlot, + InvalidBuffer, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + /* reset temp memory each time to avoid leaks from qual expr */ + ResetExprContext(econtext); + + if (ExecQual(hjclauses, econtext, false)) + { + hjstate->hj_CurTuple = hashTuple; + return heapTuple; + } } hashTuple = hashTuple->next; @@ -646,17 +775,13 @@ ExecScanHashBucket(HashJoinState *hjstate, return NULL; } -/* ---------------------------------------------------------------- - * ExecHashTableReset +/* + * ExecHashTableReset * * reset hash table header for new batch - * - * ntuples is the number of tuples in the inner relation's batch - * (which we currently don't actually use...) - * ---------------------------------------------------------------- */ void -ExecHashTableReset(HashJoinTable hashtable, long ntuples) +ExecHashTableReset(HashJoinTable hashtable) { MemoryContext oldcxt; int nbuckets = hashtable->nbuckets; @@ -668,22 +793,12 @@ ExecHashTableReset(HashJoinTable hashtable, long ntuples) MemoryContextReset(hashtable->batchCxt); oldcxt = MemoryContextSwitchTo(hashtable->batchCxt); - /* - * We still use the same number of physical buckets as in the first - * pass. (It could be different; but we already decided how many - * buckets would be appropriate for the allowed memory, so stick with - * that number.) 
We MUST set totalbuckets to equal nbuckets, because - * from now on no tuples will go out to temp files; there are no more - * virtual buckets, only real buckets. (This implies that tuples will - * go into different bucket numbers than they did on the first pass, - * but that's OK.) - */ - hashtable->totalbuckets = nbuckets; - /* Reallocate and reinitialize the hash bucket headers. */ hashtable->buckets = (HashJoinTuple *) palloc0(nbuckets * sizeof(HashJoinTuple)); + hashtable->spaceUsed = 0; + MemoryContextSwitchTo(oldcxt); } diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 07063af84f..4f4eb701d3 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.67 2004/12/31 21:59:45 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.68 2005/03/06 22:15:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -16,16 +16,19 @@ #include "postgres.h" #include "executor/executor.h" +#include "executor/hashjoin.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" #include "optimizer/clauses.h" #include "utils/memutils.h" -static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *node, - HashJoinState *hjstate); +static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue); static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, + uint32 *hashvalue, TupleTableSlot *tupleSlot); static int ExecHashJoinNewBatch(HashJoinState *hjstate); @@ -34,9 +37,9 @@ static int ExecHashJoinNewBatch(HashJoinState *hjstate); * ExecHashJoin * * This function implements the Hybrid Hashjoin algorithm. - * recursive partitioning remains to be added. - * Note: the relation we build hash table on is the inner - * the other one is outer. + * + * Note: the relation we build hash table on is the "inner" + * the other one is "outer". * ---------------------------------------------------------------- */ TupleTableSlot * /* return: a tuple or NULL */ @@ -45,8 +48,6 @@ ExecHashJoin(HashJoinState *node) EState *estate; PlanState *outerNode; HashState *hashNode; - List *hjclauses; - List *outerkeys; List *joinqual; List *otherqual; ScanDirection dir; @@ -56,12 +57,12 @@ ExecHashJoin(HashJoinState *node) HashJoinTable hashtable; HeapTuple curtuple; TupleTableSlot *outerTupleSlot; - int i; + uint32 hashvalue; + int batchno; /* * get information from HashJoin node */ - hjclauses = node->hashclauses; estate = node->js.ps.state; joinqual = node->js.joinqual; otherqual = node->js.ps.qual; @@ -73,7 +74,6 @@ ExecHashJoin(HashJoinState *node) * get information from HashJoin state */ hashtable = node->hj_HashTable; - outerkeys = node->hj_OuterHashKeys; econtext = node->js.ps.ps_ExprContext; /* @@ -111,12 +111,11 @@ ExecHashJoin(HashJoinState *node) /* * if this is the first call, build the hash table for inner relation */ - if (!node->hj_hashdone) + if (hashtable == NULL) { /* * create the hash table */ - Assert(hashtable == NULL); hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan, node->hj_HashOperators); node->hj_HashTable = hashtable; @@ -139,13 +138,10 @@ ExecHashJoin(HashJoinState *node) } /* - * Open temp files for outer batches, if needed. Note that file - * buffers are palloc'd in regular executor context. 
+ * need to remember whether nbatch has increased since we began + * scanning the outer relation */ - for (i = 0; i < hashtable->nbatch; i++) - hashtable->outerBatchFile[i] = BufFileCreateTemp(false); - - node->hj_hashdone = true; + hashtable->nbatch_outstart = hashtable->nbatch; } /* @@ -159,7 +155,8 @@ ExecHashJoin(HashJoinState *node) if (node->hj_NeedNewOuter) { outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode, - node); + node, + &hashvalue); if (TupIsNull(outerTupleSlot)) { /* end of join */ @@ -175,32 +172,26 @@ ExecHashJoin(HashJoinState *node) * now we have an outer tuple, find the corresponding bucket * for this tuple from the hash table */ - node->hj_CurBucketNo = ExecHashGetBucket(hashtable, econtext, - outerkeys); + node->hj_CurHashValue = hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &node->hj_CurBucketNo, &batchno); node->hj_CurTuple = NULL; /* * Now we've got an outer tuple and the corresponding hash * bucket, but this tuple may not belong to the current batch. - * This need only be checked in the first pass. */ - if (hashtable->curbatch == 0) + if (batchno != hashtable->curbatch) { - int batchno = ExecHashGetBatch(node->hj_CurBucketNo, - hashtable); - - if (batchno >= 0) - { - /* - * Need to postpone this outer tuple to a later batch. - * Save it in the corresponding outer-batch file. - */ - hashtable->outerBatchSize[batchno]++; - ExecHashJoinSaveTuple(outerTupleSlot->val, - hashtable->outerBatchFile[batchno]); - node->hj_NeedNewOuter = true; - continue; /* loop around for a new outer tuple */ - } + /* + * Need to postpone this outer tuple to a later batch. + * Save it in the corresponding outer-batch file. + */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(outerTupleSlot->val, hashvalue, + &hashtable->outerBatchFile[batchno]); + node->hj_NeedNewOuter = true; + continue; /* loop around for a new outer tuple */ } } @@ -209,9 +200,7 @@ ExecHashJoin(HashJoinState *node) */ for (;;) { - curtuple = ExecScanHashBucket(node, - hjclauses, - econtext); + curtuple = ExecScanHashBucket(node, econtext); if (curtuple == NULL) break; /* out of matches */ @@ -412,10 +401,9 @@ ExecInitHashJoin(HashJoin *node, EState *estate) /* * initialize hash-specific info */ - - hjstate->hj_hashdone = false; hjstate->hj_HashTable = NULL; + hjstate->hj_CurHashValue = 0; hjstate->hj_CurBucketNo = 0; hjstate->hj_CurTuple = NULL; @@ -499,17 +487,21 @@ ExecEndHashJoin(HashJoinState *node) ExecEndNode(innerPlanState(node)); } -/* ---------------------------------------------------------------- - * ExecHashJoinOuterGetTuple +/* + * ExecHashJoinOuterGetTuple * * get the next outer tuple for hashjoin: either by - * executing a plan node as in the first pass, or from - * the tmp files for the hashjoin batches. - * ---------------------------------------------------------------- + * executing a plan node in the first pass, or from + * the temp files for the hashjoin batches. + * + * Returns a null slot if no more outer tuples. On success, the tuple's + * hash value is stored at *hashvalue --- this is either originally computed, + * or re-read from the temp file. 
*/ - static TupleTableSlot * -ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate) +ExecHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue) { HashJoinTable hashtable = hjstate->hj_HashTable; int curbatch = hashtable->curbatch; @@ -517,9 +509,20 @@ ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate) if (curbatch == 0) { /* if it is the first pass */ - slot = ExecProcNode(node); + slot = ExecProcNode(outerNode); if (!TupIsNull(slot)) + { + /* + * We have to compute the tuple's hash value. + */ + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + + econtext->ecxt_outertuple = slot; + *hashvalue = ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys); + return slot; + } /* * We have just reached the end of the first pass. Try to switch @@ -530,12 +533,14 @@ ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate) /* * Try to read from a temp file. Loop allows us to advance to new - * batch as needed. + * batches as needed. NOTE: nbatch could increase inside + * ExecHashJoinNewBatch, so don't try to optimize this loop. */ - while (curbatch <= hashtable->nbatch) + while (curbatch < hashtable->nbatch) { slot = ExecHashJoinGetSavedTuple(hjstate, - hashtable->outerBatchFile[curbatch - 1], + hashtable->outerBatchFile[curbatch], + hashvalue, hjstate->hj_OuterTupleSlot); if (!TupIsNull(slot)) return slot; @@ -546,164 +551,223 @@ ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate) return NULL; } -/* ---------------------------------------------------------------- - * ExecHashJoinGetSavedTuple - * - * read the next tuple from a tmp file - * ---------------------------------------------------------------- - */ - -static TupleTableSlot * -ExecHashJoinGetSavedTuple(HashJoinState *hjstate, - BufFile *file, - TupleTableSlot *tupleSlot) -{ - HeapTupleData htup; - size_t nread; - HeapTuple heapTuple; - - nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData)); - if (nread == 0) - return NULL; /* end of file */ - if (nread != sizeof(HeapTupleData)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read from hash-join temporary file: %m"))); - heapTuple = palloc(HEAPTUPLESIZE + htup.t_len); - memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData)); - heapTuple->t_datamcxt = CurrentMemoryContext; - heapTuple->t_data = (HeapTupleHeader) - ((char *) heapTuple + HEAPTUPLESIZE); - nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len); - if (nread != (size_t) htup.t_len) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read from hash-join temporary file: %m"))); - return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true); -} - -/* ---------------------------------------------------------------- - * ExecHashJoinNewBatch - * +/* + * ExecHashJoinNewBatch * switch to a new hashjoin batch - * ---------------------------------------------------------------- + * + * Returns the number of the new batch (1..nbatch-1), or nbatch if no more. + * We will never return a batch number that has an empty outer batch file. 
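+ *
+ * (Note that an empty inner batch file alone does not let us skip a
+ * batch: in a LEFT JOIN the outer tuples must still be emitted, and
+ * after an on-the-fly nbatch increase a batch file may contain tuples
+ * that really belong to a later batch; the rules in the loop below
+ * spell out exactly when a batch can be ignored.)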
*/ static int ExecHashJoinNewBatch(HashJoinState *hjstate) { HashJoinTable hashtable = hjstate->hj_HashTable; - int nbatch = hashtable->nbatch; - int newbatch = hashtable->curbatch + 1; - long *innerBatchSize = hashtable->innerBatchSize; - long *outerBatchSize = hashtable->outerBatchSize; + int nbatch; + int curbatch; BufFile *innerFile; TupleTableSlot *slot; - ExprContext *econtext; - List *innerhashkeys; + uint32 hashvalue; - if (newbatch > 1) +start_over: + nbatch = hashtable->nbatch; + curbatch = hashtable->curbatch; + + if (curbatch > 0) { /* * We no longer need the previous outer batch file; close it right * away to free disk space. */ - BufFileClose(hashtable->outerBatchFile[newbatch - 2]); - hashtable->outerBatchFile[newbatch - 2] = NULL; + if (hashtable->outerBatchFile[curbatch]) + BufFileClose(hashtable->outerBatchFile[curbatch]); + hashtable->outerBatchFile[curbatch] = NULL; } /* - * Normally we can skip over any batches that are empty on either side - * --- but for JOIN_LEFT, can only skip when left side is empty. - * Release associated temp files right away. + * We can always skip over any batches that are completely empty on both + * sides. We can sometimes skip over batches that are empty on only one + * side, but there are exceptions: + * + * 1. In a LEFT JOIN, we have to process outer batches even if the + * inner batch is empty. + * + * 2. If we have increased nbatch since the initial estimate, we have + * to scan inner batches since they might contain tuples that need to + * be reassigned to later inner batches. + * + * 3. Similarly, if we have increased nbatch since starting the outer + * scan, we have to rescan outer batches in case they contain tuples + * that need to be reassigned. */ - while (newbatch <= nbatch && - (outerBatchSize[newbatch - 1] == 0L || - (innerBatchSize[newbatch - 1] == 0L && - hjstate->js.jointype != JOIN_LEFT))) + curbatch++; + while (curbatch < nbatch && + (hashtable->outerBatchFile[curbatch] == NULL || + hashtable->innerBatchFile[curbatch] == NULL)) { - BufFileClose(hashtable->innerBatchFile[newbatch - 1]); - hashtable->innerBatchFile[newbatch - 1] = NULL; - BufFileClose(hashtable->outerBatchFile[newbatch - 1]); - hashtable->outerBatchFile[newbatch - 1] = NULL; - newbatch++; + if (hashtable->outerBatchFile[curbatch] && + hjstate->js.jointype == JOIN_LEFT) + break; /* must process due to rule 1 */ + if (hashtable->innerBatchFile[curbatch] && + nbatch != hashtable->nbatch_original) + break; /* must process due to rule 2 */ + if (hashtable->outerBatchFile[curbatch] && + nbatch != hashtable->nbatch_outstart) + break; /* must process due to rule 3 */ + /* We can ignore this batch. */ + /* Release associated temp files right away. */ + if (hashtable->innerBatchFile[curbatch]) + BufFileClose(hashtable->innerBatchFile[curbatch]); + hashtable->innerBatchFile[curbatch] = NULL; + if (hashtable->outerBatchFile[curbatch]) + BufFileClose(hashtable->outerBatchFile[curbatch]); + hashtable->outerBatchFile[curbatch] = NULL; + curbatch++; } - if (newbatch > nbatch) - return newbatch; /* no more batches */ - - /* - * Rewind inner and outer batch files for this batch, so that we can - * start reading them. 
- */ - if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0, 0L, SEEK_SET)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rewind hash-join temporary file: %m"))); + if (curbatch >= nbatch) + return curbatch; /* no more batches */ - innerFile = hashtable->innerBatchFile[newbatch - 1]; - - if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rewind hash-join temporary file: %m"))); + hashtable->curbatch = curbatch; /* - * Reload the hash table with the new inner batch + * Reload the hash table with the new inner batch (which could be empty) */ - ExecHashTableReset(hashtable, innerBatchSize[newbatch - 1]); + ExecHashTableReset(hashtable); - econtext = hjstate->js.ps.ps_ExprContext; - innerhashkeys = hjstate->hj_InnerHashKeys; + innerFile = hashtable->innerBatchFile[curbatch]; - while ((slot = ExecHashJoinGetSavedTuple(hjstate, - innerFile, - hjstate->hj_HashTupleSlot)) - && !TupIsNull(slot)) + if (innerFile != NULL) { - econtext->ecxt_innertuple = slot; - ExecHashTableInsert(hashtable, econtext, innerhashkeys); + if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + + while ((slot = ExecHashJoinGetSavedTuple(hjstate, + innerFile, + &hashvalue, + hjstate->hj_HashTupleSlot))) + { + /* + * NOTE: some tuples may be sent to future batches. Also, + * it is possible for hashtable->nbatch to be increased here! + */ + ExecHashTableInsert(hashtable, slot->val, hashvalue); + } + + /* + * after we build the hash table, the inner batch file is no longer + * needed + */ + BufFileClose(innerFile); + hashtable->innerBatchFile[curbatch] = NULL; } /* - * after we build the hash table, the inner batch file is no longer - * needed + * If there's no outer batch file, advance to next batch. */ - BufFileClose(innerFile); - hashtable->innerBatchFile[newbatch - 1] = NULL; + if (hashtable->outerBatchFile[curbatch] == NULL) + goto start_over; - hashtable->curbatch = newbatch; - return newbatch; + /* + * Rewind outer batch file, so that we can start reading it. + */ + if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + + return curbatch; } -/* ---------------------------------------------------------------- - * ExecHashJoinSaveTuple +/* + * ExecHashJoinSaveTuple + * save a tuple to a batch file. * - * save a tuple to a tmp file. + * The data recorded in the file for each tuple is its hash value, + * then an image of its HeapTupleData (with meaningless t_data pointer) + * followed by the HeapTupleHeader and tuple data. * - * The data recorded in the file for each tuple is an image of its - * HeapTupleData (with meaningless t_data pointer) followed by the - * HeapTupleHeader and tuple data. - * ---------------------------------------------------------------- + * Note: it is important always to call this in the regular executor + * context, not in a shorter-lived context; else the temp file buffers + * will get messed up. */ - void -ExecHashJoinSaveTuple(HeapTuple heapTuple, - BufFile *file) +ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue, + BufFile **fileptr) { + BufFile *file = *fileptr; size_t written; + if (file == NULL) + { + /* First write to this batch file, so open it. 
*/ + file = BufFileCreateTemp(false); + *fileptr = file; + } + + written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)); + if (written != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to hash-join temporary file: %m"))); + written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData)); if (written != sizeof(HeapTupleData)) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not write to hash-join temporary file: %m"))); + errmsg("could not write to hash-join temporary file: %m"))); + written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len); if (written != (size_t) heapTuple->t_len) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not write to hash-join temporary file: %m"))); + errmsg("could not write to hash-join temporary file: %m"))); } +/* + * ExecHashJoinGetSavedTuple + * read the next tuple from a batch file. Return NULL if no more. + * + * On success, *hashvalue is set to the tuple's hash value, and the tuple + * itself is stored in the given slot. + */ +static TupleTableSlot * +ExecHashJoinGetSavedTuple(HashJoinState *hjstate, + BufFile *file, + uint32 *hashvalue, + TupleTableSlot *tupleSlot) +{ + HeapTupleData htup; + size_t nread; + HeapTuple heapTuple; + + nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32)); + if (nread == 0) + return NULL; /* end of file */ + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: %m"))); + nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData)); + if (nread != sizeof(HeapTupleData)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: %m"))); + heapTuple = palloc(HEAPTUPLESIZE + htup.t_len); + memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData)); + heapTuple->t_datamcxt = CurrentMemoryContext; + heapTuple->t_data = (HeapTupleHeader) + ((char *) heapTuple + HEAPTUPLESIZE); + nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len); + if (nread != (size_t) htup.t_len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: %m"))); + return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true); +} + + void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) { @@ -711,9 +775,8 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) * If we haven't yet built the hash table then we can just return; * nothing done yet, so nothing to undo. */ - if (!node->hj_hashdone) + if (node->hj_HashTable == NULL) return; - Assert(node->hj_HashTable != NULL); /* * In a multi-batch join, we currently have to do rescans the hard @@ -722,7 +785,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) * parameter change for the inner subnode, then we can just re-use the * existing hash table without rebuilding it. 
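	 * (Reuse is not possible for a multi-batch table because the inner
	 * batch files are closed and discarded as each batch is consumed,
	 * so the inner data would have to be fetched again anyway.)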
*/ - if (node->hj_HashTable->nbatch == 0 && + if (node->hj_HashTable->nbatch == 1 && ((PlanState *) node)->righttree->chgParam == NULL) { /* okay to reuse the hash table; needn't rescan inner, either */ @@ -730,7 +793,6 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) else { /* must destroy and rebuild hash table */ - node->hj_hashdone = false; ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; @@ -743,6 +805,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) } /* Always reset intra-tuple state */ + node->hj_CurHashValue = 0; node->hj_CurBucketNo = 0; node->hj_CurTuple = NULL; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 54a24b9952..e6d2c5cd80 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -49,7 +49,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.137 2004/12/31 22:00:04 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.138 2005/03/06 22:15:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1074,9 +1074,9 @@ cost_hashjoin(HashPath *path, Query *root) double innerbytes = relation_byte_size(inner_path_rows, inner_path->parent->width); int num_hashclauses = list_length(hashclauses); - int virtualbuckets; - int physicalbuckets; + int numbuckets; int numbatches; + double virtualbuckets; Selectivity innerbucketsize; Selectivity joininfactor; ListCell *hcl; @@ -1123,9 +1123,9 @@ cost_hashjoin(HashPath *path, Query *root) /* Get hash table size that executor would use for inner relation */ ExecChooseHashTableSize(inner_path_rows, inner_path->parent->width, - &virtualbuckets, - &physicalbuckets, + &numbuckets, &numbatches); + virtualbuckets = (double) numbuckets * (double) numbatches; /* * Determine bucketsize fraction for inner relation. We use the @@ -1196,13 +1196,13 @@ cost_hashjoin(HashPath *path, Query *root) } /* - * if inner relation is too big then we will need to "batch" the join, + * If inner relation is too big then we will need to "batch" the join, * which implies writing and reading most of the tuples to disk an * extra time. Charge one cost unit per page of I/O (correct since it * should be nice and sequential...). Writing the inner rel counts as * startup cost, all the rest as run cost. */ - if (numbatches) + if (numbatches > 1) { double outerpages = page_size(outer_path_rows, outer_path->parent->width); @@ -1228,7 +1228,9 @@ cost_hashjoin(HashPath *path, Query *root) * The number of tuple comparisons needed is the number of outer * tuples times the typical number of tuples in a hash bucket, which * is the inner relation size times its bucketsize fraction. At each - * one, we need to evaluate the hashjoin quals. + * one, we need to evaluate the hashjoin quals. (Note: charging the + * full qual eval cost at each tuple is pessimistic, since we don't + * evaluate the quals unless the hash values match exactly.) 
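+	 * (Illustrative arithmetic, not from the original comment: with 1e6
+	 * outer rows, 1e6 inner rows, and innerbucketsize = 1/16528 --- the
+	 * virtualbuckets for numbuckets = 1033 and numbatches = 16 --- each
+	 * outer tuple meets about 60 inner tuples, so roughly 6e7 hashclause
+	 * evaluations get charged here.)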
*/ startup_cost += hash_qual_cost.startup; run_cost += hash_qual_cost.per_tuple * diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index b4536710eb..c63eff09cd 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.171 2005/02/01 23:07:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.172 2005/03/06 22:15:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2154,7 +2154,7 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) * inner rel is well-dispersed (or the alternatives seem much worse). */ Selectivity -estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets) +estimate_hash_bucketsize(Query *root, Node *hashkey, double nbuckets) { VariableStatData vardata; double estfract, @@ -2212,8 +2212,8 @@ estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets) * the number of buckets is less than the expected number of distinct * values; otherwise it is 1/ndistinct. */ - if (ndistinct > (double) nbuckets) - estfract = 1.0 / (double) nbuckets; + if (ndistinct > nbuckets) + estfract = 1.0 / nbuckets; else estfract = 1.0 / ndistinct; diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index e267f474ed..c0f75922e1 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.34 2004/12/31 22:03:29 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.35 2005/03/06 22:15:05 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -20,11 +20,12 @@ /* ---------------------------------------------------------------- * hash-join hash table structures * - * Each active hashjoin has a HashJoinTable control block which is + * Each active hashjoin has a HashJoinTable control block, which is * palloc'd in the executor's per-query context. All other storage needed * for the hashjoin is kept in private memory contexts, two for each hashjoin. * This makes it easy and fast to release the storage when we don't need it - * anymore. + * anymore. (Exception: data associated with the temp files lives in the + * per-query context too, since we always call buffile.c in that context.) * * The hashtable contexts are made children of the per-query context, ensuring * that they will be discarded at end of statement even if the join is @@ -35,40 +36,64 @@ * "hashCxt", while storage that is only wanted for the current batch is * allocated in the "batchCxt". By resetting the batchCxt at the end of * each batch, we free all the per-batch storage reliably and without tedium. + * + * During first scan of inner relation, we get its tuples from executor. + * If nbatch > 1 then tuples that don't belong in first batch get saved + * into inner-batch temp files. The same statements apply for the + * first scan of the outer relation, except we write tuples to outer-batch + * temp files. After finishing the first scan, we do the following for + * each remaining batch: + * 1. Read tuples from inner batch file, load into hash buckets. + * 2. Read tuples from outer batch file, match to hash buckets and output. 
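+ * (Thus a join with nbatch = N scans each input relation once and then
+ * performs N-1 of these reload-and-probe cycles.)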
+ * + * It is possible to increase nbatch on the fly if the in-memory hash table + * gets too big. The hash-value-to-batch computation is arranged so that this + * can only cause a tuple to go into a later batch than previously thought, + * never into an earlier batch. When we increase nbatch, we rescan the hash + * table and dump out any tuples that are now of a later batch to the correct + * inner batch file. Subsequently, while reading either inner or outer batch + * files, we might find tuples that no longer belong to the current batch; + * if so, we just dump them out to the correct batch file. * ---------------------------------------------------------------- */ +/* these are in nodes/execnodes.h: */ +/* typedef struct HashJoinTupleData *HashJoinTuple; */ +/* typedef struct HashJoinTableData *HashJoinTable; */ + typedef struct HashJoinTupleData { - struct HashJoinTupleData *next; /* link to next tuple in same - * bucket */ + struct HashJoinTupleData *next; /* link to next tuple in same bucket */ + uint32 hashvalue; /* tuple's hash code */ HeapTupleData htup; /* tuple header */ } HashJoinTupleData; -typedef HashJoinTupleData *HashJoinTuple; - typedef struct HashJoinTableData { - int nbuckets; /* buckets in use during this batch */ - int totalbuckets; /* total number of (virtual) buckets */ - HashJoinTuple *buckets; /* buckets[i] is head of list of tuples */ + int nbuckets; /* # buckets in the in-memory hash table */ + /* buckets[i] is head of list of tuples in i'th in-memory bucket */ + struct HashJoinTupleData **buckets; /* buckets array is per-batch storage, as are all the tuples */ - int nbatch; /* number of batches; 0 means 1-pass join */ - int curbatch; /* current batch #, or 0 during 1st pass */ + int nbatch; /* number of batches */ + int curbatch; /* current batch #; 0 during 1st pass */ + + int nbatch_original; /* nbatch when we started inner scan */ + int nbatch_outstart; /* nbatch when we started outer scan */ + + bool growEnabled; /* flag to shut off nbatch increases */ bool hashNonEmpty; /* did inner plan produce any rows? */ /* - * all these arrays are allocated for the life of the hash join, but - * only if nbatch > 0: + * These arrays are allocated for the life of the hash join, but + * only if nbatch > 1. A file is opened only when we first write + * a tuple into it (otherwise its pointer remains NULL). Note that + * the zero'th array elements never get used, since we will process + * rather than dump out any tuples of batch zero. */ BufFile **innerBatchFile; /* buffered virtual temp file per batch */ BufFile **outerBatchFile; /* buffered virtual temp file per batch */ - long *outerBatchSize; /* count of tuples in each outer batch - * file */ - long *innerBatchSize; /* count of tuples in each inner batch - * file */ /* * Info about the datatype-specific hash functions for the datatypes @@ -79,21 +104,11 @@ typedef struct HashJoinTableData */ FmgrInfo *hashfunctions; /* lookup data for hash functions */ - /* - * During 1st scan of inner relation, we get tuples from executor. If - * nbatch > 0 then tuples that don't belong in first nbuckets logical - * buckets get dumped into inner-batch temp files. The same statements - * apply for the 1st scan of the outer relation, except we write - * tuples to outer-batch temp files. If nbatch > 0 then we do the - * following for each batch: 1. Read tuples from inner batch file, - * load into hash buckets. 2. Read tuples from outer batch file, match - * to hash buckets and output. 
- */ + Size spaceUsed; /* memory space currently used by tuples */ + Size spaceAllowed; /* upper limit for space used */ MemoryContext hashCxt; /* context for whole-hash-join storage */ MemoryContext batchCxt; /* context for this-batch-only storage */ } HashJoinTableData; -typedef HashJoinTableData *HashJoinTable; - #endif /* HASHJOIN_H */ diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index 781cfcf838..06d73c060e 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.35 2004/12/31 22:03:29 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.36 2005/03/06 22:15:05 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -25,18 +25,20 @@ extern void ExecReScanHash(HashState *node, ExprContext *exprCtxt); extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators); extern void ExecHashTableDestroy(HashJoinTable hashtable); extern void ExecHashTableInsert(HashJoinTable hashtable, - ExprContext *econtext, - List *hashkeys); -extern int ExecHashGetBucket(HashJoinTable hashtable, - ExprContext *econtext, - List *hashkeys); -extern int ExecHashGetBatch(int bucketno, HashJoinTable hashtable); -extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, List *hjclauses, - ExprContext *econtext); -extern void ExecHashTableReset(HashJoinTable hashtable, long ntuples); + HeapTuple tuple, + uint32 hashvalue); +extern uint32 ExecHashGetHashValue(HashJoinTable hashtable, + ExprContext *econtext, + List *hashkeys); +extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable, + uint32 hashvalue, + int *bucketno, + int *batchno); +extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, + ExprContext *econtext); +extern void ExecHashTableReset(HashJoinTable hashtable); extern void ExecChooseHashTableSize(double ntuples, int tupwidth, - int *virtualbuckets, - int *physicalbuckets, + int *numbuckets, int *numbatches); #endif /* NODEHASH_H */ diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h index 1902c11fb8..44e942317d 100644 --- a/src/include/executor/nodeHashjoin.h +++ b/src/include/executor/nodeHashjoin.h @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * nodeHashjoin.h - * + * prototypes for nodeHashjoin.c * * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.28 2004/12/31 22:03:29 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.29 2005/03/06 22:15:05 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -15,6 +15,7 @@ #define NODEHASHJOIN_H #include "nodes/execnodes.h" +#include "storage/buffile.h" extern int ExecCountSlotsHashJoin(HashJoin *node); extern HashJoinState *ExecInitHashJoin(HashJoin *node, EState *estate); @@ -22,6 +23,7 @@ extern TupleTableSlot *ExecHashJoin(HashJoinState *node); extern void ExecEndHashJoin(HashJoinState *node); extern void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt); -extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, BufFile *file); +extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 
hashvalue, + BufFile **fileptr); #endif /* NODEHASHJOIN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index a9bfd5c5d9..6cc4334299 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.122 2004/12/31 22:03:34 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.123 2005/03/06 22:15:05 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -15,7 +15,6 @@ #define EXECNODES_H #include "access/relscan.h" -#include "executor/hashjoin.h" #include "executor/tuptable.h" #include "fmgr.h" #include "nodes/bitmapset.h" @@ -985,11 +984,13 @@ typedef struct MergeJoinState * HashJoinState information * * hj_HashTable hash table for the hashjoin + * (NULL if table not built yet) + * hj_CurHashValue hash value for current outer tuple * hj_CurBucketNo bucket# for current outer tuple * hj_CurTuple last inner tuple matched to current outer * tuple, or NULL if starting search - * (CurBucketNo and CurTuple are meaningless - * unless OuterTupleSlot is nonempty!) + * (CurHashValue, CurBucketNo and CurTuple are + * undefined if OuterTupleSlot is empty!) * hj_OuterHashKeys the outer hash keys in the hashjoin condition * hj_InnerHashKeys the inner hash keys in the hashjoin condition * hj_HashOperators the join operators in the hashjoin condition @@ -998,14 +999,19 @@ typedef struct MergeJoinState * hj_NullInnerTupleSlot prepared null tuple for left outer joins * hj_NeedNewOuter true if need new outer tuple on next call * hj_MatchedOuter true if found a join match for current outer - * hj_hashdone true if hash-table-build phase is done * ---------------- */ + +/* these structs are defined in executor/hashjoin.h: */ +typedef struct HashJoinTupleData *HashJoinTuple; +typedef struct HashJoinTableData *HashJoinTable; + typedef struct HashJoinState { JoinState js; /* its first field is NodeTag */ List *hashclauses; /* list of ExprState nodes */ HashJoinTable hj_HashTable; + uint32 hj_CurHashValue; int hj_CurBucketNo; HashJoinTuple hj_CurTuple; List *hj_OuterHashKeys; /* list of ExprState nodes */ @@ -1016,7 +1022,6 @@ typedef struct HashJoinState TupleTableSlot *hj_NullInnerTupleSlot; bool hj_NeedNewOuter; bool hj_MatchedOuter; - bool hj_hashdone; } HashJoinState; diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index cb212e0de7..5fbc9d3ae6 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.21 2004/12/31 22:03:46 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.22 2005/03/06 22:15:05 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -121,7 +121,7 @@ extern double estimate_num_groups(Query *root, List *groupExprs, double input_rows); extern Selectivity estimate_hash_bucketsize(Query *root, Node *hashkey, - int nbuckets); + double nbuckets); extern Datum btcostestimate(PG_FUNCTION_ARGS); extern Datum rtcostestimate(PG_FUNCTION_ARGS); -- 2.40.0