granicus.if.org Git - postgresql/commitdiff
Convert hash join code to use MinimalTuple format in tuple hash table
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 27 Jun 2006 21:31:20 +0000 (21:31 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 27 Jun 2006 21:31:20 +0000 (21:31 +0000)
and batch files.  Should reduce memory and I/O demands for such joins.

src/backend/executor/execTuples.c
src/backend/executor/nodeHash.c
src/backend/executor/nodeHashjoin.c
src/include/executor/hashjoin.h
src/include/executor/nodeHash.h
src/include/executor/nodeHashjoin.h
src/include/executor/tuptable.h

index f03d738619dd290c9f7c08a9b96ef6f47176564e..fd54c3d03c1ad817c416b0acd9d6aaba7ae290e2 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/execTuples.c,v 1.95 2006/06/27 02:51:39 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/execTuples.c,v 1.96 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -718,6 +718,55 @@ ExecFetchSlotTuple(TupleTableSlot *slot)
        return ExecMaterializeSlot(slot);
 }
 
+/* --------------------------------
+ *             ExecFetchSlotMinimalTuple
+ *                     Fetch the slot's minimal physical tuple.
+ *
+ *             If the slot contains a virtual tuple, we convert it to minimal
+ *             physical form.  The slot retains ownership of the physical tuple.
+ *             Likewise, if it contains a regular tuple we convert to minimal form.
+ *
+ * As above, the result must be treated as read-only.
+ * --------------------------------
+ */
+MinimalTuple
+ExecFetchSlotMinimalTuple(TupleTableSlot *slot)
+{
+       MinimalTuple newTuple;
+       MemoryContext oldContext;
+
+       /*
+        * sanity checks
+        */
+       Assert(slot != NULL);
+       Assert(!slot->tts_isempty);
+
+       /*
+        * If we have a minimal physical tuple then just return it.
+        */
+       if (slot->tts_mintuple)
+               return slot->tts_mintuple;
+
+       /*
+        * Otherwise, build a minimal tuple, and then store it as the new slot
+        * value.  (Note: tts_nvalid will be reset to zero here.  There are cases
+        * in which this could be optimized but it's probably not worth worrying
+        * about.)
+        *
+        * We may be called in a context that is shorter-lived than the tuple
+        * slot, but we have to ensure that the materialized tuple will survive
+        * anyway.
+        */
+       oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+       newTuple = ExecCopySlotMinimalTuple(slot);
+       MemoryContextSwitchTo(oldContext);
+
+       ExecStoreMinimalTuple(newTuple, slot, true);
+
+       Assert(slot->tts_mintuple);
+       return slot->tts_mintuple;
+}
+
 /* --------------------------------
  *             ExecMaterializeSlot
  *                     Force a slot into the "materialized" state.
index 5710afb2fca8703df27d4493f8753912eb6f3b01..3c8de3f5e7f5470d669e50ab85697e4e78da3be6 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.103 2006/05/30 14:01:58 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.104 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -92,7 +92,7 @@ MultiExecHash(HashState *node)
                /* We have to compute the hash value */
                econtext->ecxt_innertuple = slot;
                hashvalue = ExecHashGetHashValue(hashtable, econtext, hashkeys);
-               ExecHashTableInsert(hashtable, ExecFetchSlotTuple(slot), hashvalue);
+               ExecHashTableInsert(hashtable, slot, hashvalue);
        }
 
        /* must provide our own instrumentation support */
@@ -358,8 +358,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth,
         * does not allow for any palloc overhead.      The manipulations of spaceUsed
         * don't count palloc overhead either.
         */
-       tupsize = MAXALIGN(sizeof(HashJoinTupleData)) +
-               MAXALIGN(sizeof(HeapTupleHeaderData)) +
+       tupsize = HJTUPLE_OVERHEAD +
+               MAXALIGN(sizeof(MinimalTupleData)) +
                MAXALIGN(tupwidth);
        inner_rel_bytes = ntuples * tupsize;
 
@@ -548,7 +548,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
                        {
                                /* dump it out */
                                Assert(batchno > curbatch);
-                               ExecHashJoinSaveTuple(&tuple->htup, tuple->hashvalue,
+                               ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
+                                                                         tuple->hashvalue,
                                                                          &hashtable->innerBatchFile[batchno]);
                                /* and remove from hash table */
                                if (prevtuple)
@@ -557,7 +558,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
                                        hashtable->buckets[i] = nexttuple;
                                /* prevtuple doesn't change */
                                hashtable->spaceUsed -=
-                                       MAXALIGN(sizeof(HashJoinTupleData)) + tuple->htup.t_len;
+                                       HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
                                pfree(tuple);
                                nfreed++;
                        }
@@ -592,12 +593,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
  * ExecHashTableInsert
  *             insert a tuple into the hash table depending on the hash value
  *             it may just go to a temp file for later batches
+ *
+ * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
+ * tuple; the minimal case in particular is certain to happen while reloading
+ * tuples from batch files.  We could save some cycles in the regular-tuple
+ * case by not forcing the slot contents into minimal form; not clear if it's
+ * worth the messiness required.
  */
 void
 ExecHashTableInsert(HashJoinTable hashtable,
-                                       HeapTuple tuple,
+                                       TupleTableSlot *slot,
                                        uint32 hashvalue)
 {
+       MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
        int                     bucketno;
        int                     batchno;
 
@@ -615,18 +623,11 @@ ExecHashTableInsert(HashJoinTable hashtable,
                HashJoinTuple hashTuple;
                int                     hashTupleSize;
 
-               hashTupleSize = MAXALIGN(sizeof(HashJoinTupleData)) + tuple->t_len;
+               hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
                hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
                                                                                                           hashTupleSize);
                hashTuple->hashvalue = hashvalue;
-               memcpy((char *) &hashTuple->htup,
-                          (char *) tuple,
-                          sizeof(hashTuple->htup));
-               hashTuple->htup.t_data = (HeapTupleHeader)
-                       (((char *) hashTuple) + MAXALIGN(sizeof(HashJoinTupleData)));
-               memcpy((char *) hashTuple->htup.t_data,
-                          (char *) tuple->t_data,
-                          tuple->t_len);
+               memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
                hashTuple->next = hashtable->buckets[bucketno];
                hashtable->buckets[bucketno] = hashTuple;
                hashtable->spaceUsed += hashTupleSize;
@@ -639,7 +640,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
                 * put the tuple into a temp file for later batches
                 */
                Assert(batchno > hashtable->curbatch);
-               ExecHashJoinSaveTuple(tuple, hashvalue,
+               ExecHashJoinSaveTuple(tuple,
+                                                         hashvalue,
                                                          &hashtable->innerBatchFile[batchno]);
        }
 }
@@ -749,7 +751,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
  *
  * The current outer tuple must be stored in econtext->ecxt_outertuple.
  */
-HeapTuple
+HashJoinTuple
 ExecScanHashBucket(HashJoinState *hjstate,
                                   ExprContext *econtext)
 {
@@ -771,14 +773,12 @@ ExecScanHashBucket(HashJoinState *hjstate,
        {
                if (hashTuple->hashvalue == hashvalue)
                {
-                       HeapTuple       heapTuple = &hashTuple->htup;
                        TupleTableSlot *inntuple;
 
                        /* insert hashtable's tuple into exec slot so ExecQual sees it */
-                       inntuple = ExecStoreTuple(heapTuple,
-                                                                         hjstate->hj_HashTupleSlot,
-                                                                         InvalidBuffer,
-                                                                         false);       /* do not pfree */
+                       inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+                                                                                        hjstate->hj_HashTupleSlot,
+                                                                                        false);        /* do not pfree */
                        econtext->ecxt_innertuple = inntuple;
 
                        /* reset temp memory each time to avoid leaks from qual expr */
@@ -787,7 +787,7 @@ ExecScanHashBucket(HashJoinState *hjstate,
                        if (ExecQual(hjclauses, econtext, false))
                        {
                                hjstate->hj_CurTuple = hashTuple;
-                               return heapTuple;
+                               return hashTuple;
                        }
                }
 
index 097343fd88cdfc2bbf1c04142500edac291dcc5d..572aa1a59112456feb821629f2c2b6d44d407b98 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.82 2006/06/16 18:42:22 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.83 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -54,7 +54,7 @@ ExecHashJoin(HashJoinState *node)
        ExprContext *econtext;
        ExprDoneCond isDone;
        HashJoinTable hashtable;
-       HeapTuple       curtuple;
+       HashJoinTuple curtuple;
        TupleTableSlot *outerTupleSlot;
        uint32          hashvalue;
        int                     batchno;
@@ -224,7 +224,7 @@ ExecHashJoin(HashJoinState *node)
                                 * in the corresponding outer-batch file.
                                 */
                                Assert(batchno > hashtable->curbatch);
-                               ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
+                               ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
                                                                          hashvalue,
                                                                          &hashtable->outerBatchFile[batchno]);
                                node->hj_NeedNewOuter = true;
@@ -244,10 +244,9 @@ ExecHashJoin(HashJoinState *node)
                        /*
                         * we've got a match, but still need to test non-hashed quals
                         */
-                       inntuple = ExecStoreTuple(curtuple,
-                                                                         node->hj_HashTupleSlot,
-                                                                         InvalidBuffer,
-                                                                         false);       /* don't pfree this tuple */
+                       inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
+                                                                                        node->hj_HashTupleSlot,
+                                                                                        false);        /* don't pfree */
                        econtext->ecxt_innertuple = inntuple;
 
                        /* reset temp memory each time to avoid leaks from qual expr */
@@ -706,9 +705,7 @@ start_over:
                         * NOTE: some tuples may be sent to future batches.  Also, it is
                         * possible for hashtable->nbatch to be increased here!
                         */
-                       ExecHashTableInsert(hashtable,
-                                                               ExecFetchSlotTuple(slot),
-                                                               hashvalue);
+                       ExecHashTableInsert(hashtable, slot, hashvalue);
                }
 
                /*
@@ -741,15 +738,14 @@ start_over:
  *             save a tuple to a batch file.
  *
  * The data recorded in the file for each tuple is its hash value,
- * then an image of its HeapTupleData (with meaningless t_data pointer)
- * followed by the HeapTupleHeader and tuple data.
+ * then the tuple in MinimalTuple format.
  *
  * Note: it is important always to call this in the regular executor
  * context, not in a shorter-lived context; else the temp file buffers
  * will get messed up.
  */
 void
-ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
+ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
                                          BufFile **fileptr)
 {
        BufFile    *file = *fileptr;
@@ -768,14 +764,8 @@ ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
                                (errcode_for_file_access(),
                                 errmsg("could not write to hash-join temporary file: %m")));
 
-       written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
-       if (written != sizeof(HeapTupleData))
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not write to hash-join temporary file: %m")));
-
-       written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
-       if (written != (size_t) heapTuple->t_len)
+       written = BufFileWrite(file, (void *) tuple, tuple->t_len);
+       if (written != tuple->t_len)
                ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not write to hash-join temporary file: %m")));
@@ -794,32 +784,36 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
                                                  uint32 *hashvalue,
                                                  TupleTableSlot *tupleSlot)
 {
-       HeapTupleData htup;
+       uint32          header[2];
        size_t          nread;
-       HeapTuple       heapTuple;
+       MinimalTuple tuple;
 
-       nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
-       if (nread == 0)
-               return NULL;                    /* end of file */
-       if (nread != sizeof(uint32))
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not read from hash-join temporary file: %m")));
-       nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
-       if (nread != sizeof(HeapTupleData))
+       /*
+        * Since both the hash value and the MinimalTuple length word are
+        * uint32, we can read them both in one BufFileRead() call without
+        * any type cheating.
+        */
+       nread = BufFileRead(file, (void *) header, sizeof(header));
+       if (nread == 0)                 /* end of file */
+       {
+               ExecClearTuple(tupleSlot);
+               return NULL;
+       }
+       if (nread != sizeof(header))
                ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not read from hash-join temporary file: %m")));
-       heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
-       memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
-       heapTuple->t_data = (HeapTupleHeader)
-               ((char *) heapTuple + HEAPTUPLESIZE);
-       nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
-       if (nread != (size_t) htup.t_len)
+       *hashvalue = header[0];
+       tuple = (MinimalTuple) palloc(header[1]);
+       tuple->t_len = header[1];
+       nread = BufFileRead(file,
+                                               (void *) ((char *) tuple + sizeof(uint32)),
+                                               header[1] - sizeof(uint32));
+       if (nread != header[1] - sizeof(uint32))
                ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not read from hash-join temporary file: %m")));
-       return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
+       return ExecStoreMinimalTuple(tuple, tupleSlot, true);
 }
 
 
index 38cae6251ecc53e1113e63eff6896299adffb0b4..c4e6e460fedf12f7d1e563e2e36c430e2489f1f2 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.38 2006/03/05 15:58:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.39 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,9 +65,14 @@ typedef struct HashJoinTupleData
 {
        struct HashJoinTupleData *next;         /* link to next tuple in same bucket */
        uint32          hashvalue;              /* tuple's hash code */
-       HeapTupleData htup;                     /* tuple header */
+       /* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
 } HashJoinTupleData;
 
+#define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(HashJoinTupleData))
+#define HJTUPLE_MINTUPLE(hjtup)  \
+       ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
+
+
 typedef struct HashJoinTableData
 {
        int                     nbuckets;               /* # buckets in the in-memory hash table */
index 9a413827d72f88bf4410373e1a2e92a04bcaf0d7..0e0a9b5ec5e02e1c69f4cce558b0d79640d25183 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.40 2006/03/05 15:58:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.41 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,7 +26,7 @@ extern void ExecReScanHash(HashState *node, ExprContext *exprCtxt);
 extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators);
 extern void ExecHashTableDestroy(HashJoinTable hashtable);
 extern void ExecHashTableInsert(HashJoinTable hashtable,
-                                       HeapTuple tuple,
+                                       TupleTableSlot *slot,
                                        uint32 hashvalue);
 extern uint32 ExecHashGetHashValue(HashJoinTable hashtable,
                                         ExprContext *econtext,
@@ -35,7 +35,7 @@ extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
                                                  uint32 hashvalue,
                                                  int *bucketno,
                                                  int *batchno);
-extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate,
+extern HashJoinTuple ExecScanHashBucket(HashJoinState *hjstate,
                                   ExprContext *econtext);
 extern void ExecHashTableReset(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
index 84f07d3644800aacfc1c5b2d2dd2edf0445d2940..cbbb76230b643c0a1ab195666273dd66ee02920c 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.32 2006/03/05 15:58:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.33 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,7 +23,7 @@ extern TupleTableSlot *ExecHashJoin(HashJoinState *node);
 extern void ExecEndHashJoin(HashJoinState *node);
 extern void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt);
 
-extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
+extern void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
                                          BufFile **fileptr);
 
 #endif   /* NODEHASHJOIN_H */
index 85318351340e38f4f5a34c67b85eb76f7df676b0..6d5bc02b93d1190cc75d578b2a625b2709cfe513 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/tuptable.h,v 1.32 2006/06/27 02:51:40 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/executor/tuptable.h,v 1.33 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -159,6 +159,7 @@ extern TupleTableSlot *ExecStoreAllNullTuple(TupleTableSlot *slot);
 extern HeapTuple ExecCopySlotTuple(TupleTableSlot *slot);
 extern MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot);
 extern HeapTuple ExecFetchSlotTuple(TupleTableSlot *slot);
+extern MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot);
 extern HeapTuple ExecMaterializeSlot(TupleTableSlot *slot);
 extern TupleTableSlot *ExecCopySlot(TupleTableSlot *dstslot,
                         TupleTableSlot *srcslot);