]> granicus.if.org Git - postgresql/blob - src/backend/executor/nodeHash.c
Rename SortMem and VacuumMem to work_mem and maintenance_work_mem.
[postgresql] / src / backend / executor / nodeHash.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHash.c
4  *        Routines to hash relations for hashjoin
5  *
6  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.82 2004/02/03 17:34:02 tgl Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * INTERFACE ROUTINES
17  *              ExecHash                - generate an in-memory hash table of the relation
18  *              ExecInitHash    - initialize node and subnodes
19  *              ExecEndHash             - shutdown node and subnodes
20  */
21 #include "postgres.h"
22
23 #include "executor/execdebug.h"
24 #include "executor/nodeHash.h"
25 #include "executor/nodeHashjoin.h"
26 #include "miscadmin.h"
27 #include "parser/parse_expr.h"
28 #include "utils/memutils.h"
29 #include "utils/lsyscache.h"
30
31
32 /* ----------------------------------------------------------------
33  *              ExecHash
34  *
35  *              build hash table for hashjoin, all do partitioning if more
36  *              than one batches are required.
37  * ----------------------------------------------------------------
38  */
39 TupleTableSlot *
40 ExecHash(HashState *node)
41 {
42         EState     *estate;
43         PlanState  *outerNode;
44         List       *hashkeys;
45         HashJoinTable hashtable;
46         TupleTableSlot *slot;
47         ExprContext *econtext;
48         int                     nbatch;
49         int                     i;
50
51         /*
52          * get state info from node
53          */
54         estate = node->ps.state;
55         outerNode = outerPlanState(node);
56
57         hashtable = node->hashtable;
58         nbatch = hashtable->nbatch;
59
60         if (nbatch > 0)
61         {
62                 /*
63                  * Open temp files for inner batches, if needed. Note that file
64                  * buffers are palloc'd in regular executor context.
65                  */
66                 for (i = 0; i < nbatch; i++)
67                         hashtable->innerBatchFile[i] = BufFileCreateTemp(false);
68         }
69
70         /*
71          * set expression context
72          */
73         hashkeys = node->hashkeys;
74         econtext = node->ps.ps_ExprContext;
75
76         /*
77          * get all inner tuples and insert into the hash table (or temp files)
78          */
79         for (;;)
80         {
81                 slot = ExecProcNode(outerNode);
82                 if (TupIsNull(slot))
83                         break;
84                 econtext->ecxt_innertuple = slot;
85                 ExecHashTableInsert(hashtable, econtext, hashkeys);
86                 ExecClearTuple(slot);
87         }
88
89         /*
90          * Return the slot so that we have the tuple descriptor when we need
91          * to save/restore them.  -Jeff 11 July 1991
92          */
93         return slot;
94 }
95
96 /* ----------------------------------------------------------------
97  *              ExecInitHash
98  *
99  *              Init routine for Hash node
100  * ----------------------------------------------------------------
101  */
102 HashState *
103 ExecInitHash(Hash *node, EState *estate)
104 {
105         HashState  *hashstate;
106
107         SO_printf("ExecInitHash: initializing hash node\n");
108
109         /*
110          * create state structure
111          */
112         hashstate = makeNode(HashState);
113         hashstate->ps.plan = (Plan *) node;
114         hashstate->ps.state = estate;
115         hashstate->hashtable = NULL;
116         hashstate->hashkeys = NIL;      /* will be set by parent HashJoin */
117
118         /*
119          * Miscellaneous initialization
120          *
121          * create expression context for node
122          */
123         ExecAssignExprContext(estate, &hashstate->ps);
124
125 #define HASH_NSLOTS 1
126
127         /*
128          * initialize our result slot
129          */
130         ExecInitResultTupleSlot(estate, &hashstate->ps);
131
132         /*
133          * initialize child expressions
134          */
135         hashstate->ps.targetlist = (List *)
136                 ExecInitExpr((Expr *) node->plan.targetlist,
137                                          (PlanState *) hashstate);
138         hashstate->ps.qual = (List *)
139                 ExecInitExpr((Expr *) node->plan.qual,
140                                          (PlanState *) hashstate);
141
142         /*
143          * initialize child nodes
144          */
145         outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate);
146
147         /*
148          * initialize tuple type. no need to initialize projection info
149          * because this node doesn't do projections
150          */
151         ExecAssignResultTypeFromOuterPlan(&hashstate->ps);
152         hashstate->ps.ps_ProjInfo = NULL;
153
154         return hashstate;
155 }
156
157 int
158 ExecCountSlotsHash(Hash *node)
159 {
160         return ExecCountSlotsNode(outerPlan(node)) +
161                 ExecCountSlotsNode(innerPlan(node)) +
162                 HASH_NSLOTS;
163 }
164
165 /* ---------------------------------------------------------------
166  *              ExecEndHash
167  *
168  *              clean up routine for Hash node
169  * ----------------------------------------------------------------
170  */
171 void
172 ExecEndHash(HashState *node)
173 {
174         PlanState  *outerPlan;
175
176         /*
177          * free exprcontext
178          */
179         ExecFreeExprContext(&node->ps);
180
181         /*
182          * shut down the subplan
183          */
184         outerPlan = outerPlanState(node);
185         ExecEndNode(outerPlan);
186 }
187
188
189 /* ----------------------------------------------------------------
190  *              ExecHashTableCreate
191  *
192  *              create a hashtable in shared memory for hashjoin.
193  * ----------------------------------------------------------------
194  */
195 HashJoinTable
196 ExecHashTableCreate(Hash *node, List *hashOperators)
197 {
198         HashJoinTable hashtable;
199         Plan       *outerNode;
200         int                     totalbuckets;
201         int                     nbuckets;
202         int                     nbatch;
203         int                     nkeys;
204         int                     i;
205         List       *ho;
206         MemoryContext oldcxt;
207
208         /*
209          * Get information about the size of the relation to be hashed (it's
210          * the "outer" subtree of this node, but the inner relation of the
211          * hashjoin).  Compute the appropriate size of the hash table.
212          */
213         outerNode = outerPlan(node);
214
215         ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
216                                                         &totalbuckets, &nbuckets, &nbatch);
217
218 #ifdef HJDEBUG
219         printf("nbatch = %d, totalbuckets = %d, nbuckets = %d\n",
220                    nbatch, totalbuckets, nbuckets);
221 #endif
222
223         /*
224          * Initialize the hash table control block.
225          *
226          * The hashtable control block is just palloc'd from the executor's
227          * per-query memory context.
228          */
229         hashtable = (HashJoinTable) palloc(sizeof(HashTableData));
230         hashtable->nbuckets = nbuckets;
231         hashtable->totalbuckets = totalbuckets;
232         hashtable->buckets = NULL;
233         hashtable->nbatch = nbatch;
234         hashtable->curbatch = 0;
235         hashtable->innerBatchFile = NULL;
236         hashtable->outerBatchFile = NULL;
237         hashtable->innerBatchSize = NULL;
238         hashtable->outerBatchSize = NULL;
239
240         /*
241          * Get info about the hash functions to be used for each hash key.
242          */
243         nkeys = length(hashOperators);
244         hashtable->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
245         i = 0;
246         foreach(ho, hashOperators)
247         {
248                 Oid                     hashfn;
249
250                 hashfn = get_op_hash_function(lfirsto(ho));
251                 if (!OidIsValid(hashfn))
252                         elog(ERROR, "could not find hash function for hash operator %u",
253                                  lfirsto(ho));
254                 fmgr_info(hashfn, &hashtable->hashfunctions[i]);
255                 i++;
256         }
257
258         /*
259          * Create temporary memory contexts in which to keep the hashtable
260          * working storage.  See notes in executor/hashjoin.h.
261          */
262         hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
263                                                                                            "HashTableContext",
264                                                                                            ALLOCSET_DEFAULT_MINSIZE,
265                                                                                            ALLOCSET_DEFAULT_INITSIZE,
266                                                                                            ALLOCSET_DEFAULT_MAXSIZE);
267
268         hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
269                                                                                                 "HashBatchContext",
270                                                                                                 ALLOCSET_DEFAULT_MINSIZE,
271                                                                                                 ALLOCSET_DEFAULT_INITSIZE,
272                                                                                                 ALLOCSET_DEFAULT_MAXSIZE);
273
274         /* Allocate data that will live for the life of the hashjoin */
275
276         oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
277
278         if (nbatch > 0)
279         {
280                 /*
281                  * allocate and initialize the file arrays in hashCxt
282                  */
283                 hashtable->innerBatchFile = (BufFile **)
284                         palloc0(nbatch * sizeof(BufFile *));
285                 hashtable->outerBatchFile = (BufFile **)
286                         palloc0(nbatch * sizeof(BufFile *));
287                 hashtable->innerBatchSize = (long *)
288                         palloc0(nbatch * sizeof(long));
289                 hashtable->outerBatchSize = (long *)
290                         palloc0(nbatch * sizeof(long));
291                 /* The files will not be opened until later... */
292         }
293
294         /*
295          * Prepare context for the first-scan space allocations; allocate the
296          * hashbucket array therein, and set each bucket "empty".
297          */
298         MemoryContextSwitchTo(hashtable->batchCxt);
299
300         hashtable->buckets = (HashJoinTuple *)
301                 palloc0(nbuckets * sizeof(HashJoinTuple));
302
303         MemoryContextSwitchTo(oldcxt);
304
305         return hashtable;
306 }
307
308
309 /*
310  * Compute appropriate size for hashtable given the estimated size of the
311  * relation to be hashed (number of rows and average row width).
312  *
313  * Caution: the input is only the planner's estimates, and so can't be
314  * trusted too far.  Apply a healthy fudge factor.
315  *
316  * This is exported so that the planner's costsize.c can use it.
317  */
318
319 /* Target bucket loading (tuples per bucket) */
320 #define NTUP_PER_BUCKET                 10
321 /* Fudge factor to allow for inaccuracy of input estimates */
322 #define FUDGE_FAC                               2.0
323
324 void
325 ExecChooseHashTableSize(double ntuples, int tupwidth,
326                                                 int *virtualbuckets,
327                                                 int *physicalbuckets,
328                                                 int *numbatches)
329 {
330         int                     tupsize;
331         double          inner_rel_bytes;
332         long            hash_table_bytes;
333         double          dtmp;
334         int                     nbatch;
335         int                     nbuckets;
336         int                     totalbuckets;
337         int                     bucketsize;
338
339         /* Force a plausible relation size if no info */
340         if (ntuples <= 0.0)
341                 ntuples = 1000.0;
342
343         /*
344          * Estimate tupsize based on footprint of tuple in hashtable... but
345          * what about palloc overhead?
346          */
347         tupsize = MAXALIGN(tupwidth) + MAXALIGN(sizeof(HashJoinTupleData));
348         inner_rel_bytes = ntuples * tupsize * FUDGE_FAC;
349
350         /*
351          * Target in-memory hashtable size is work_mem kilobytes.
352          */
353         hash_table_bytes = work_mem * 1024L;
354
355         /*
356          * Count the number of hash buckets we want for the whole relation,
357          * for an average bucket load of NTUP_PER_BUCKET (per virtual
358          * bucket!).  It has to fit in an int, however.
359          */
360         dtmp = ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET);
361         if (dtmp < INT_MAX)
362                 totalbuckets = (int) dtmp;
363         else
364                 totalbuckets = INT_MAX;
365         if (totalbuckets <= 0)
366                 totalbuckets = 1;
367
368         /*
369          * Count the number of buckets we think will actually fit in the
370          * target memory size, at a loading of NTUP_PER_BUCKET (physical
371          * buckets). NOTE: FUDGE_FAC here determines the fraction of the
372          * hashtable space reserved to allow for nonuniform distribution of
373          * hash values. Perhaps this should be a different number from the
374          * other uses of FUDGE_FAC, but since we have no real good way to pick
375          * either one...
376          */
377         bucketsize = NTUP_PER_BUCKET * tupsize;
378         nbuckets = (int) (hash_table_bytes / (bucketsize * FUDGE_FAC));
379         if (nbuckets <= 0)
380                 nbuckets = 1;
381
382         if (totalbuckets <= nbuckets)
383         {
384                 /*
385                  * We have enough space, so no batching.  In theory we could even
386                  * reduce nbuckets, but since that could lead to poor behavior if
387                  * estimated ntuples is much less than reality, it seems better to
388                  * make more buckets instead of fewer.
389                  */
390                 totalbuckets = nbuckets;
391                 nbatch = 0;
392         }
393         else
394         {
395                 /*
396                  * Need to batch; compute how many batches we want to use. Note
397                  * that nbatch doesn't have to have anything to do with the ratio
398                  * totalbuckets/nbuckets; in fact, it is the number of groups we
399                  * will use for the part of the data that doesn't fall into the
400                  * first nbuckets hash buckets.  We try to set it to make all the
401                  * batches the same size.
402                  */
403                 dtmp = ceil((inner_rel_bytes - hash_table_bytes) /
404                                         hash_table_bytes);
405                 if (dtmp < INT_MAX)
406                         nbatch = (int) dtmp;
407                 else
408                         nbatch = INT_MAX;
409                 if (nbatch <= 0)
410                         nbatch = 1;
411         }
412
413         /*
414          * Now, totalbuckets is the number of (virtual) hashbuckets for the
415          * whole relation, and nbuckets is the number of physical hashbuckets
416          * we will use in the first pass.  Data falling into the first
417          * nbuckets virtual hashbuckets gets handled in the first pass;
418          * everything else gets divided into nbatch batches to be processed in
419          * additional passes.
420          */
421         *virtualbuckets = totalbuckets;
422         *physicalbuckets = nbuckets;
423         *numbatches = nbatch;
424 }
425
426
427 /* ----------------------------------------------------------------
428  *              ExecHashTableDestroy
429  *
430  *              destroy a hash table
431  * ----------------------------------------------------------------
432  */
433 void
434 ExecHashTableDestroy(HashJoinTable hashtable)
435 {
436         int                     i;
437
438         /* Make sure all the temp files are closed */
439         for (i = 0; i < hashtable->nbatch; i++)
440         {
441                 if (hashtable->innerBatchFile[i])
442                         BufFileClose(hashtable->innerBatchFile[i]);
443                 if (hashtable->outerBatchFile[i])
444                         BufFileClose(hashtable->outerBatchFile[i]);
445         }
446
447         /* Release working memory (batchCxt is a child, so it goes away too) */
448         MemoryContextDelete(hashtable->hashCxt);
449
450         /* And drop the control block */
451         pfree(hashtable);
452 }
453
454 /* ----------------------------------------------------------------
455  *              ExecHashTableInsert
456  *
457  *              insert a tuple into the hash table depending on the hash value
458  *              it may just go to a tmp file for other batches
459  * ----------------------------------------------------------------
460  */
461 void
462 ExecHashTableInsert(HashJoinTable hashtable,
463                                         ExprContext *econtext,
464                                         List *hashkeys)
465 {
466         int                     bucketno = ExecHashGetBucket(hashtable, econtext, hashkeys);
467         int                     batchno = ExecHashGetBatch(bucketno, hashtable);
468         TupleTableSlot *slot = econtext->ecxt_innertuple;
469         HeapTuple       heapTuple = slot->val;
470
471         /*
472          * decide whether to put the tuple in the hash table or a tmp file
473          */
474         if (batchno < 0)
475         {
476                 /*
477                  * put the tuple in hash table
478                  */
479                 HashJoinTuple hashTuple;
480                 int                     hashTupleSize;
481
482                 hashTupleSize = MAXALIGN(sizeof(*hashTuple)) + heapTuple->t_len;
483                 hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
484                                                                                                            hashTupleSize);
485                 memcpy((char *) &hashTuple->htup,
486                            (char *) heapTuple,
487                            sizeof(hashTuple->htup));
488                 hashTuple->htup.t_datamcxt = hashtable->batchCxt;
489                 hashTuple->htup.t_data = (HeapTupleHeader)
490                         (((char *) hashTuple) + MAXALIGN(sizeof(*hashTuple)));
491                 memcpy((char *) hashTuple->htup.t_data,
492                            (char *) heapTuple->t_data,
493                            heapTuple->t_len);
494                 hashTuple->next = hashtable->buckets[bucketno];
495                 hashtable->buckets[bucketno] = hashTuple;
496         }
497         else
498         {
499                 /*
500                  * put the tuple into a tmp file for later batches
501                  */
502                 hashtable->innerBatchSize[batchno]++;
503                 ExecHashJoinSaveTuple(heapTuple,
504                                                           hashtable->innerBatchFile[batchno]);
505         }
506 }
507
508 /* ----------------------------------------------------------------
509  *              ExecHashGetBucket
510  *
511  *              Get the hash value for a tuple
512  * ----------------------------------------------------------------
513  */
514 int
515 ExecHashGetBucket(HashJoinTable hashtable,
516                                   ExprContext *econtext,
517                                   List *hashkeys)
518 {
519         uint32          hashkey = 0;
520         int                     bucketno;
521         List       *hk;
522         int                     i = 0;
523         MemoryContext oldContext;
524
525         /*
526          * We reset the eval context each time to reclaim any memory leaked in
527          * the hashkey expressions.
528          */
529         ResetExprContext(econtext);
530
531         oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
532
533         foreach(hk, hashkeys)
534         {
535                 Datum           keyval;
536                 bool            isNull;
537
538                 /* rotate hashkey left 1 bit at each step */
539                 hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
540
541                 /*
542                  * Get the join attribute value of the tuple
543                  */
544                 keyval = ExecEvalExpr((ExprState *) lfirst(hk),
545                                                           econtext, &isNull, NULL);
546
547                 /*
548                  * Compute the hash function
549                  */
550                 if (!isNull)                    /* treat nulls as having hash key 0 */
551                 {
552                         uint32          hkey;
553
554                         hkey = DatumGetUInt32(FunctionCall1(&hashtable->hashfunctions[i],
555                                                                                                 keyval));
556                         hashkey ^= hkey;
557                 }
558
559                 i++;
560         }
561
562         bucketno = hashkey % (uint32) hashtable->totalbuckets;
563
564 #ifdef HJDEBUG
565         if (bucketno >= hashtable->nbuckets)
566                 printf("hash(%u) = %d SAVED\n", hashkey, bucketno);
567         else
568                 printf("hash(%u) = %d\n", hashkey, bucketno);
569 #endif
570
571         MemoryContextSwitchTo(oldContext);
572
573         return bucketno;
574 }
575
576 /* ----------------------------------------------------------------
577  *              ExecHashGetBatch
578  *
579  *              determine the batch number for a bucketno
580  *
581  * Returns -1 if bucket belongs to initial (or current) batch,
582  * else 0..nbatch-1 corresponding to external batch file number for bucket.
583  * ----------------------------------------------------------------
584  */
585 int
586 ExecHashGetBatch(int bucketno, HashJoinTable hashtable)
587 {
588         if (bucketno < hashtable->nbuckets)
589                 return -1;
590
591         return (bucketno - hashtable->nbuckets) % hashtable->nbatch;
592 }
593
594 /* ----------------------------------------------------------------
595  *              ExecScanHashBucket
596  *
597  *              scan a hash bucket of matches
598  * ----------------------------------------------------------------
599  */
600 HeapTuple
601 ExecScanHashBucket(HashJoinState *hjstate,
602                                    List *hjclauses,
603                                    ExprContext *econtext)
604 {
605         HashJoinTable hashtable = hjstate->hj_HashTable;
606         HashJoinTuple hashTuple = hjstate->hj_CurTuple;
607
608         /*
609          * hj_CurTuple is NULL to start scanning a new bucket, or the address
610          * of the last tuple returned from the current bucket.
611          */
612         if (hashTuple == NULL)
613                 hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
614         else
615                 hashTuple = hashTuple->next;
616
617         while (hashTuple != NULL)
618         {
619                 HeapTuple       heapTuple = &hashTuple->htup;
620                 TupleTableSlot *inntuple;
621
622                 /* insert hashtable's tuple into exec slot so ExecQual sees it */
623                 inntuple = ExecStoreTuple(heapTuple,    /* tuple to store */
624                                                                   hjstate->hj_HashTupleSlot,    /* slot */
625                                                                   InvalidBuffer,
626                                                                   false);               /* do not pfree this tuple */
627                 econtext->ecxt_innertuple = inntuple;
628
629                 /* reset temp memory each time to avoid leaks from qual expression */
630                 ResetExprContext(econtext);
631
632                 if (ExecQual(hjclauses, econtext, false))
633                 {
634                         hjstate->hj_CurTuple = hashTuple;
635                         return heapTuple;
636                 }
637
638                 hashTuple = hashTuple->next;
639         }
640
641         /*
642          * no match
643          */
644         return NULL;
645 }
646
647 /* ----------------------------------------------------------------
648  *              ExecHashTableReset
649  *
650  *              reset hash table header for new batch
651  *
652  *              ntuples is the number of tuples in the inner relation's batch
653  *              (which we currently don't actually use...)
654  * ----------------------------------------------------------------
655  */
656 void
657 ExecHashTableReset(HashJoinTable hashtable, long ntuples)
658 {
659         MemoryContext oldcxt;
660         int                     nbuckets = hashtable->nbuckets;
661
662         /*
663          * Release all the hash buckets and tuples acquired in the prior pass,
664          * and reinitialize the context for a new pass.
665          */
666         MemoryContextReset(hashtable->batchCxt);
667         oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
668
669         /*
670          * We still use the same number of physical buckets as in the first
671          * pass. (It could be different; but we already decided how many
672          * buckets would be appropriate for the allowed memory, so stick with
673          * that number.) We MUST set totalbuckets to equal nbuckets, because
674          * from now on no tuples will go out to temp files; there are no more
675          * virtual buckets, only real buckets.  (This implies that tuples will
676          * go into different bucket numbers than they did on the first pass,
677          * but that's OK.)
678          */
679         hashtable->totalbuckets = nbuckets;
680
681         /* Reallocate and reinitialize the hash bucket headers. */
682         hashtable->buckets = (HashJoinTuple *)
683                 palloc0(nbuckets * sizeof(HashJoinTuple));
684
685         MemoryContextSwitchTo(oldcxt);
686 }
687
688 void
689 ExecReScanHash(HashState *node, ExprContext *exprCtxt)
690 {
691         /*
692          * if chgParam of subnode is not null then plan will be re-scanned by
693          * first ExecProcNode.
694          */
695         if (((PlanState *) node)->lefttree->chgParam == NULL)
696                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
697 }