* the merge is complete. The basic merge algorithm thus needs very little
* memory --- only M tuples for an M-way merge, and M is constrained to a
* small number. However, we can still make good use of our full workMem
- * allocation by pre-reading additional tuples from each source tape. Without
+ * allocation by pre-reading additional blocks from each source tape. Without
* prereading, our access pattern to the temporary file would be very erratic;
* on average we'd read one block from each of M source tapes during the same
* time that we're writing M blocks to the output tape, so there is no
* worse when it comes time to read that tape. A straightforward merge pass
* thus ends up doing a lot of waiting for disk seeks. We can improve matters
* by prereading from each source tape sequentially, loading about workMem/M
- * bytes from each tape in turn. Then we run the merge algorithm, writing but
- * not reading until one of the preloaded tuple series runs out. Then we
- * switch back to preread mode, fill memory again, and repeat. This approach
- * helps to localize both read and write accesses.
+ * bytes from each tape in turn, and making the sequential blocks immediately
+ * available for reuse. This approach helps to localize both read and write
+ * accesses. The pre-reading is handled by logtape.c; we just tell it how
+ * much memory to use for the buffers.
*
* When the caller requests random access to the sort result, we form
* the final sorted run on a logical tape which is then "frozen", so
* The objects we actually sort are SortTuple structs. These contain
* a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
* which is a separate palloc chunk --- we assume it is just one chunk and
- * can be freed by a simple pfree() (except during final on-the-fly merge,
- * when memory is used in batch). SortTuples also contain the tuple's
- * first key column in Datum/nullflag format, and an index integer.
+ * can be freed by a simple pfree() (except during merge, when we use a
+ * simple slab allocator). SortTuples also contain the tuple's first key
+ * column in Datum/nullflag format, and an index integer.
*
* Storing the first key column lets us save heap_getattr or index_getattr
* calls during tuple comparisons. We could extract and save all the key
* it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
* selection is always abandoned after the first run; no other run number
* should be represented here. During merge passes, we re-use it to hold the
- * input tape number that each tuple in the heap was read from, or to hold the
- * index of the next tuple pre-read from the same tape in the case of pre-read
- * entries. tupindex goes unused if the sort occurs entirely in memory.
+ * input tape number that each tuple in the heap was read from. tupindex goes
+ * unused if the sort occurs entirely in memory.
*/
typedef struct
{
int tupindex; /* see notes above */
} SortTuple;
+/*
+ * During merge, we use a pre-allocated set of fixed-size slots to hold
+ * tuples, to avoid palloc/pfree overhead.
+ *
+ * Merge doesn't require a lot of memory, so we can afford to waste some,
+ * by using gratuitously-sized slots. If a tuple is larger than 1 kB, it
+ * falls back to palloc(), where the overhead is no longer significant
+ * relative to the tuple size.
+ *
+ * 'nextfree' is valid when the slot is in the free list. When in use, the
+ * slot holds a tuple.
+ */
+#define SLAB_SLOT_SIZE 1024
+
+typedef union SlabSlot
+{
+ union SlabSlot *nextfree;
+ char buffer[SLAB_SLOT_SIZE];
+} SlabSlot;
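+
+/*
+ * Illustrative sketch, not used by tuplesort itself: how the SlabSlot union
+ * doubles as a free-list link. While a slot sits on the free list, the
+ * first sizeof(SlabSlot *) bytes of its buffer hold the 'nextfree' link;
+ * once the slot is handed out, the tuple simply overwrites the whole
+ * SLAB_SLOT_SIZE buffer. Both operations are O(1) pointer swaps:
+ */
+#ifdef NOT_USED
+static SlabSlot *
+slab_pop(SlabSlot **freeHead)
+{
+ SlabSlot *slot = *freeHead;
+
+ if (slot != NULL)
+ *freeHead = slot->nextfree; /* unlink; buffer is now usable */
+ return slot;
+}
+
+static void
+slab_push(SlabSlot **freeHead, SlabSlot *slot)
+{
+ slot->nextfree = *freeHead; /* relink at the head of the free list */
+ *freeHead = slot;
+}
+#endif
+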
/*
* Possible states of a Tuplesort object. These denote the states that
/*
* Function to write a stored tuple onto tape. The representation of the
* tuple on tape need not be the same as it is in memory; requirements on
- * the tape representation are given below. After writing the tuple,
- * pfree() the out-of-line data (not the SortTuple struct!), and increase
- * state->availMem by the amount of memory space thereby released.
+ * the tape representation are given below. Unless the slab allocator is
+ * used, after writing the tuple, pfree() the out-of-line data (not the
+ * SortTuple struct!), and increase state->availMem by the amount of
+ * memory space thereby released.
*/
void (*writetup) (Tuplesortstate *state, int tapenum,
SortTuple *stup);
/*
* Function to read a stored tuple from tape back into memory. 'len' is
- * the already-read length of the stored tuple. Create a palloc'd copy,
- * initialize tuple/datum1/isnull1 in the target SortTuple struct, and
- * decrease state->availMem by the amount of memory space consumed. (See
- * batchUsed notes for details on how memory is handled when incremental
- * accounting is abandoned.)
+ * the already-read length of the stored tuple. The tuple is allocated
+ * from the slab memory arena, or is palloc'd, see readtup_alloc().
*/
void (*readtup) (Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
- /*
- * Function to move a caller tuple. This is usually implemented as a
- * memmove() shim, but function may also perform additional fix-up of
- * caller tuple where needed. Batch memory support requires the movement
- * of caller tuples from one location in memory to another.
- */
- void (*movetup) (void *dest, void *src, unsigned int len);
-
/*
* This array holds the tuples now in sort memory. If we are in state
* INITIAL, the tuples are in no particular order; if we are in state
* SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
* and FINALMERGE, the tuples are organized in "heap" order per Algorithm
- * H. (Note that memtupcount only counts the tuples that are part of the
- * heap --- during merge passes, memtuples[] entries beyond tapeRange are
- * never in the heap and are used to hold pre-read tuples.) In state
- * SORTEDONTAPE, the array is not used.
+ * H. In state SORTEDONTAPE, the array is not used.
*/
SortTuple *memtuples; /* array of SortTuple structs */
int memtupcount; /* number of tuples currently present */
bool growmemtuples; /* memtuples' growth still underway? */
/*
- * Memory for tuples is sometimes allocated in batch, rather than
- * incrementally. This implies that incremental memory accounting has
- * been abandoned. Currently, this only happens for the final on-the-fly
- * merge step. Large batch allocations can store tuples (e.g.
- * IndexTuples) without palloc() fragmentation and other overhead.
+ * Memory for tuples is sometimes allocated using a simple slab allocator,
+ * rather than with palloc(). Currently, we switch to slab allocation
+ * when we start merging. Merging only needs to keep a small, fixed
+ * number of tuples in memory at any time, so we can avoid the
+ * palloc/pfree overhead by recycling a fixed number of fixed-size slots
+ * to hold the tuples.
+ *
+ * For the slab, we use one large allocation, divided into slots of
+ * SLAB_SLOT_SIZE bytes each. The allocation is sized to have one slot per
+ * tape, plus one additional slot. We need that many slots to hold all the
+ * tuples kept in the heap during merge, plus the one last returned from
+ * the sort with tuplesort_gettuple.
+ *
+ * Initially, all the slots are kept in a linked list of free slots. When
+ * a tuple is read from a tape, it is put into the next free slot, if
+ * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
+ * instead.
+ *
+ * When we're done processing a tuple, we return the slot to the free
+ * list, or pfree() it if it was palloc'd. We know that a tuple was
+ * allocated from the slab if its pointer value lies between
+ * slabMemoryBegin and slabMemoryEnd.
+ *
+ * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
+ * tracking memory usage is not used.
+ */
+ bool slabAllocatorUsed;
+
+ char *slabMemoryBegin; /* beginning of slab memory arena */
+ char *slabMemoryEnd; /* end of slab memory arena */
+ SlabSlot *slabFreeHead; /* head of free list */
+
+ /*
+ * When a tuple that came from a tape is returned to the caller in
+ * tuplesort_gettuple_XXX (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
+ * modes), we remember it in 'lastReturnedTuple', so that we can recycle
+ * its memory on the next gettuple call.
*/
- bool batchUsed;
+ void *lastReturnedTuple;
/*
* While building initial runs, this indicates if the replacement
*/
/*
- * These variables are only used during merge passes. mergeactive[i] is
- * true if we are reading an input run from (actual) tape number i and
- * have not yet exhausted that run. mergenext[i] is the memtuples index
- * of the next pre-read tuple (next to be loaded into the heap) for tape
- * i, or 0 if we are out of pre-read tuples. mergelast[i] similarly
- * points to the last pre-read tuple from each tape. mergeavailslots[i]
- * is the number of unused memtuples[] slots reserved for tape i, and
- * mergeavailmem[i] is the amount of unused space allocated for tape i.
- * mergefreelist and mergefirstfree keep track of unused locations in the
- * memtuples[] array. The memtuples[].tupindex fields link together
- * pre-read tuples for each tape as well as recycled locations in
- * mergefreelist. It is OK to use 0 as a null link in these lists, because
- * memtuples[0] is part of the merge heap and is never a pre-read tuple.
+ * This variable is only used during merge passes. mergeactive[i] is true
+ * if we are reading an input run from (actual) tape number i and have not
+ * yet exhausted that run.
*/
bool *mergeactive; /* active input run source? */
- int *mergenext; /* first preread tuple for each source */
- int *mergelast; /* last preread tuple for each source */
- int *mergeavailslots; /* slots left for prereading each tape */
- int64 *mergeavailmem; /* availMem for prereading each tape */
- int mergefreelist; /* head of freelist of recycled slots */
- int mergefirstfree; /* first slot never used in this merge */
-
- /*
- * Per-tape batch state, when final on-the-fly merge consumes memory from
- * just a few large allocations.
- *
- * Aside from the general benefits of performing fewer individual retail
- * palloc() calls, this also helps make merging more cache efficient,
- * since each tape's tuples must naturally be accessed sequentially (in
- * sorted order).
- */
- int64 spacePerTape; /* Space (memory) for tuples (not slots) */
- char **mergetuples; /* Each tape's memory allocation */
- char **mergecurrent; /* Current offset into each tape's memory */
- char **mergetail; /* Last item's start point for each tape */
- char **mergeoverflow; /* Retail palloc() "overflow" for each tape */
/*
* Variables for Algorithm D. Note that destTape is a "logical" tape
#endif
};
+/*
+ * Is the given tuple allocated from the slab memory arena?
+ */
+#define IS_SLAB_SLOT(state, tuple) \
+ ((char *) (tuple) >= (state)->slabMemoryBegin && \
+ (char *) (tuple) < (state)->slabMemoryEnd)
+
+/*
+ * Return the given tuple to the slab memory free list, or free it
+ * if it was palloc'd.
+ */
+#define RELEASE_SLAB_SLOT(state, tuple) \
+	do { \
+		SlabSlot *buf = (SlabSlot *) (tuple); \
+ \
+		if (IS_SLAB_SLOT((state), buf)) \
+		{ \
+			buf->nextfree = (state)->slabFreeHead; \
+			(state)->slabFreeHead = buf; \
+		} \
+		else \
+			pfree(buf); \
+	} while(0)
+
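+/*
+ * An illustrative walk-through of the expected consumer pattern (assumed
+ * usage, mirroring the READTUP paths below): readtup_alloc() hands out
+ * either a slab slot or, for tuples larger than SLAB_SLOT_SIZE, a palloc'd
+ * chunk. The pointer-range test in IS_SLAB_SLOT is what lets
+ * RELEASE_SLAB_SLOT pick the matching release path without any per-tuple
+ * bookkeeping:
+ *
+ *		void *tup = readtup_alloc(state, tuplen);
+ *
+ *		... read the tuple from tape and process it ...
+ *
+ *		if (tup)
+ *			RELEASE_SLAB_SLOT(state, tup);	relink, or pfree() if palloc'd
+ */
+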
#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state))
#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
-#define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len))
-#define LACKMEM(state) ((state)->availMem < 0 && !(state)->batchUsed)
+#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
#define USEMEM(state,amt) ((state)->availMem -= (amt))
#define FREEMEM(state,amt) ((state)->availMem += (amt))
static bool useselection(Tuplesortstate *state);
static void inittapes(Tuplesortstate *state);
static void selectnewtape(Tuplesortstate *state);
+static void init_slab_allocator(Tuplesortstate *state, int numSlots);
+static void init_tape_buffers(Tuplesortstate *state, int numInputTapes);
static void mergeruns(Tuplesortstate *state);
static void mergeonerun(Tuplesortstate *state);
-static void beginmerge(Tuplesortstate *state, bool finalMergeBatch);
-static void batchmemtuples(Tuplesortstate *state);
-static void mergebatch(Tuplesortstate *state, int64 spacePerTape);
-static void mergebatchone(Tuplesortstate *state, int srcTape,
- SortTuple *stup, bool *should_free);
-static void mergebatchfreetape(Tuplesortstate *state, int srcTape,
- SortTuple *rtup, bool *should_free);
-static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen);
-static void mergepreread(Tuplesortstate *state);
-static void mergeprereadone(Tuplesortstate *state, int srcTape);
+static void beginmerge(Tuplesortstate *state);
+static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
static void dumptuples(Tuplesortstate *state, bool alltuples);
static void dumpbatch(Tuplesortstate *state, bool alltuples);
static void make_bounded_heap(Tuplesortstate *state);
static void reversedirection(Tuplesortstate *state);
static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
static void markrunend(Tuplesortstate *state, int tapenum);
-static void *readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen);
+static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
static int comparetup_heap(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
SortTuple *stup);
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
-static void movetup_heap(void *dest, void *src, unsigned int len);
static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
SortTuple *stup);
static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
-static void movetup_cluster(void *dest, void *src, unsigned int len);
static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
SortTuple *stup);
static void readtup_index(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
-static void movetup_index(void *dest, void *src, unsigned int len);
static int comparetup_datum(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
SortTuple *stup);
static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
-static void movetup_datum(void *dest, void *src, unsigned int len);
static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
/*
ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
state->growmemtuples = true;
- state->batchUsed = false;
+ state->slabAllocatorUsed = false;
state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
USEMEM(state, GetMemoryChunkSpace(state->memtuples));
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
- state->movetup = movetup_heap;
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
state->abbrevNext = 10;
state->copytup = copytup_cluster;
state->writetup = writetup_cluster;
state->readtup = readtup_cluster;
- state->movetup = movetup_cluster;
state->abbrevNext = 10;
state->indexInfo = BuildIndexInfo(indexRel);
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->movetup = movetup_index;
state->abbrevNext = 10;
state->heapRel = heapRel;
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->movetup = movetup_index;
state->heapRel = heapRel;
state->indexRel = indexRel;
state->copytup = copytup_datum;
state->writetup = writetup_datum;
state->readtup = readtup_datum;
- state->movetup = movetup_datum;
state->abbrevNext = 10;
state->datumType = datumType;
{
case TSS_SORTEDINMEM:
Assert(forward || state->randomAccess);
- Assert(!state->batchUsed);
+ Assert(!state->slabAllocatorUsed);
*should_free = false;
if (forward)
{
case TSS_SORTEDONTAPE:
Assert(forward || state->randomAccess);
- Assert(!state->batchUsed);
- *should_free = true;
+ Assert(state->slabAllocatorUsed);
+
+ /*
+ * The slot that held the tuple we returned in the previous
+ * gettuple call can now be reused.
+ */
+ if (state->lastReturnedTuple)
+ {
+ RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
+ state->lastReturnedTuple = NULL;
+ }
+
if (forward)
{
if (state->eof_reached)
return false;
+
if ((tuplen = getlen(state, state->result_tape, true)) != 0)
{
READTUP(state, stup, state->result_tape, tuplen);
+
+ /*
+ * Remember the tuple we return, so that we can recycle
+ * its memory on the next call. (This can be NULL in the
+ * !state->tuples case.)
+ */
+ state->lastReturnedTuple = stup->tuple;
+
+ *should_free = false;
return true;
}
else
tuplen))
elog(ERROR, "bogus tuple length in backward scan");
READTUP(state, stup, state->result_tape, tuplen);
+
+ /*
+ * Remember the tuple we return, so that we can recycle its memory
+ * on the next call. (This can be NULL in the Datum case.)
+ */
+ state->lastReturnedTuple = stup->tuple;
+
+ *should_free = false;
return true;
case TSS_FINALMERGE:
Assert(forward);
- Assert(state->batchUsed || !state->tuples);
- /* For now, assume tuple is stored in tape's batch memory */
+ /* We are managing memory ourselves, with the slab allocator. */
+ Assert(state->slabAllocatorUsed);
*should_free = false;
+ /*
+ * The slab slot holding the tuple we returned in the previous
+ * gettuple call can now be reused.
+ */
+ if (state->lastReturnedTuple)
+ {
+ RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
+ state->lastReturnedTuple = NULL;
+ }
+
/*
* This code should match the inner loop of mergeonerun().
*/
if (state->memtupcount > 0)
{
int srcTape = state->memtuples[0].tupindex;
- int tupIndex;
- SortTuple *newtup;
+ SortTuple newtup;
+
+ *stup = state->memtuples[0];
/*
- * Returned tuple is still counted in our memory space most of
- * the time. See mergebatchone() for discussion of why caller
- * may occasionally be required to free returned tuple, and
- * how preread memory is managed with regard to edge cases
- * more generally.
+ * Remember the tuple we return, so that we can recycle its
+ * memory on the next call. (This can be NULL in the Datum case.)
*/
- *stup = state->memtuples[0];
- if ((tupIndex = state->mergenext[srcTape]) == 0)
+ state->lastReturnedTuple = stup->tuple;
+
+ /*
+ * Pull the next tuple from the tape, and replace the returned
+ * tuple at the top of the heap with it.
+ */
+ if (!mergereadnext(state, srcTape, &newtup))
{
/*
- * out of preloaded data on this tape, try to read more
- *
- * Unlike mergeonerun(), we only preload from the single
- * tape that's run dry, though not before preparing its
- * batch memory for a new round of sequential consumption.
- * See mergepreread() comments.
+ * If there's no more data, we've reached the end of the run
+ * on this tape. Remove the top node from the heap.
*/
- if (state->batchUsed)
- mergebatchone(state, srcTape, stup, should_free);
-
- mergeprereadone(state, srcTape);
+ tuplesort_heap_delete_top(state, false);
/*
- * if still no data, we've reached end of run on this tape
+ * Rewind to free the read buffer. It'd go away at the
+ * end of the sort anyway, but better to release the
+ * memory early.
*/
- if ((tupIndex = state->mergenext[srcTape]) == 0)
- {
- /* Remove the top node from the heap */
- tuplesort_heap_delete_top(state, false);
- /* Free tape's buffer, avoiding dangling pointer */
- if (state->batchUsed)
- mergebatchfreetape(state, srcTape, stup, should_free);
- return true;
- }
+ LogicalTapeRewind(state->tapeset, srcTape, true);
+ return true;
}
-
- /*
- * pull next preread tuple from list, and replace the returned
- * tuple at top of the heap with it.
- */
- newtup = &state->memtuples[tupIndex];
- state->mergenext[srcTape] = newtup->tupindex;
- if (state->mergenext[srcTape] == 0)
- state->mergelast[srcTape] = 0;
- newtup->tupindex = srcTape;
- tuplesort_heap_replace_top(state, newtup, false);
- /* put the now-unused memtuples entry on the freelist */
- newtup->tupindex = state->mergefreelist;
- state->mergefreelist = tupIndex;
- state->mergeavailslots[srcTape]++;
+ newtup.tupindex = srcTape;
+ tuplesort_heap_replace_top(state, &newtup, false);
return true;
}
return false;
/* Compute number of tapes to use: merge order plus 1 */
maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
- /*
- * We must have at least 2*maxTapes slots in the memtuples[] array, else
- * we'd not have room for merge heap plus preread. It seems unlikely that
- * this case would ever occur, but be safe.
- */
- maxTapes = Min(maxTapes, state->memtupsize / 2);
-
state->maxTapes = maxTapes;
state->tapeRange = maxTapes - 1;
#endif
/*
- * Decrease availMem to reflect the space needed for tape buffers; but
- * don't decrease it to the point that we have no room for tuples. (That
- * case is only likely to occur if sorting pass-by-value Datums; in all
- * other scenarios the memtuples[] array is unlikely to occupy more than
- * half of allowedMem. In the pass-by-value case it's not important to
- * account for tuple space, so we don't care if LACKMEM becomes
- * inaccurate.)
+ * Decrease availMem to reflect the space needed for tape buffers, when
+ * writing the initial runs; but don't decrease it to the point that we
+ * have no room for tuples. (That case is only likely to occur if sorting
+ * pass-by-value Datums; in all other scenarios the memtuples[] array is
+ * unlikely to occupy more than half of allowedMem. In the pass-by-value
+ * case it's not important to account for tuple space, so we don't care if
+ * LACKMEM becomes inaccurate.)
*/
	tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
state->tapeset = LogicalTapeSetCreate(maxTapes);
state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
- state->mergenext = (int *) palloc0(maxTapes * sizeof(int));
- state->mergelast = (int *) palloc0(maxTapes * sizeof(int));
- state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int));
- state->mergeavailmem = (int64 *) palloc0(maxTapes * sizeof(int64));
- state->mergetuples = (char **) palloc0(maxTapes * sizeof(char *));
- state->mergecurrent = (char **) palloc0(maxTapes * sizeof(char *));
- state->mergetail = (char **) palloc0(maxTapes * sizeof(char *));
- state->mergeoverflow = (char **) palloc0(maxTapes * sizeof(char *));
state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
state->destTape = 0;
}
+/*
+ * Initialize the slab allocation arena, for the given number of slots.
+ */
+static void
+init_slab_allocator(Tuplesortstate *state, int numSlots)
+{
+ if (numSlots > 0)
+ {
+ char *p;
+ int i;
+
+ state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
+ state->slabMemoryEnd = state->slabMemoryBegin +
+ numSlots * SLAB_SLOT_SIZE;
+ state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
+ USEMEM(state, numSlots * SLAB_SLOT_SIZE);
+
+ p = state->slabMemoryBegin;
+ for (i = 0; i < numSlots - 1; i++)
+ {
+ ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
+ p += SLAB_SLOT_SIZE;
+ }
+ ((SlabSlot *) p)->nextfree = NULL;
+ }
+ else
+ {
+ state->slabMemoryBegin = state->slabMemoryEnd = NULL;
+ state->slabFreeHead = NULL;
+ }
+ state->slabAllocatorUsed = true;
+}
+
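+/*
+ * For example (illustrative): with numSlots = 4, the arena above is one
+ * 4 * SLAB_SLOT_SIZE = 4096-byte allocation, and the loop threads the
+ * free list through it in address order:
+ *
+ *		slabFreeHead -> slot 0 -> slot 1 -> slot 2 -> slot 3 -> NULL
+ *
+ * where slot i starts at slabMemoryBegin + i * SLAB_SLOT_SIZE.
+ */
+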
+/*
+ * Divide all remaining work memory (availMem) among read buffers for the
+ * tapes that will be used during the merge.
+ *
+ * We use the number of possible *input* tapes here, rather than maxTapes,
+ * for the calculation. At all times, we'll be reading from at most
+ * numInputTapes tapes, and one tape is used for output (unless we do an
+ * on-the-fly final merge, in which case we don't have an output tape).
+ */
+static void
+init_tape_buffers(Tuplesortstate *state, int numInputTapes)
+{
+ int64 availBlocks;
+ int64 blocksPerTape;
+ int remainder;
+ int tapenum;
+
+ /*
+ * Divide availMem evenly among the input tapes.
+ */
+ availBlocks = state->availMem / BLCKSZ;
+ blocksPerTape = availBlocks / numInputTapes;
+ remainder = availBlocks % numInputTapes;
+ USEMEM(state, availBlocks * BLCKSZ);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
+ (availBlocks * BLCKSZ) / 1024, numInputTapes);
+#endif
+
+ /*
+ * Use one page per tape, even if we are out of memory.
+ * tuplesort_merge_order() should've chosen the number of tapes so that
+ * this can't happen, but better safe than sorry. (This also protects
+ * from a negative availMem.)
+ */
+ if (blocksPerTape < 1)
+ {
+ blocksPerTape = 1;
+ remainder = 0;
+ }
+
+ /*
+ * Set the buffers for the tapes.
+ *
+ * In a multi-phase merge, the tape that is initially used as an output
+ * tape will later be rewound and read from, and should also use a large
+ * buffer at that point. So we must loop up to maxTapes, not just
+ * numInputTapes!
+ *
+ * If there are fewer runs than tapes, we will also set the buffer size
+ * for tapes that will go completely unused, but that's harmless.
+ * LogicalTapeAssignReadBufferSize() doesn't allocate the buffer
+ * immediately, it just sets the size that will be used, when the tape is
+ * rewound for read, and the tape isn't empty.
+ */
+ for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
+ {
+ int64 numBlocks = blocksPerTape + (tapenum < remainder ? 1 : 0);
+
+ LogicalTapeAssignReadBufferSize(state->tapeset, tapenum,
+ numBlocks * BLCKSZ);
+ }
+}
+
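+/*
+ * Worked example (illustrative, assuming the default BLCKSZ of 8192): with
+ * availMem = 1 MB and numInputTapes = 6, availBlocks = 1048576 / 8192 = 128,
+ * blocksPerTape = 128 / 6 = 21 and remainder = 2. Tapes 0 and 1 therefore
+ * get 22 * BLCKSZ bytes of read buffer each, and the remaining tapes get
+ * 21 * BLCKSZ each, consuming all 128 blocks.
+ */
+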
/*
* mergeruns -- merge all the completed initial runs.
*
svTape,
svRuns,
svDummy;
+ int numTapes;
+ int numInputTapes;
Assert(state->status == TSS_BUILDRUNS);
Assert(state->memtupcount == 0);
state->sortKeys->abbrev_full_comparator = NULL;
}
+ /*
+ * Release the tuple memory context entirely. We've freed all the tuples
+ * that we previously allocated, and we will use the slab allocator from
+ * now on.
+ */
+ MemoryContextDelete(state->tuplecontext);
+ state->tuplecontext = NULL;
+
+ /*
+ * We no longer need a large memtuples array. (We will allocate a smaller
+ * one for the heap later.)
+ */
+ FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
+ pfree(state->memtuples);
+ state->memtuples = NULL;
+
+ /*
+ * If we had fewer runs than tapes, refund the memory that we imagined we
+ * would need for the tape buffers of the unused tapes.
+ *
+ * numTapes and numInputTapes reflect the actual number of tapes we will
+ * use. Note that the output tape's tape number is maxTapes - 1, so the
+ * tape numbers of the used tapes are not consecutive, and you cannot just
+ * loop from 0 to numTapes to visit all used tapes!
+ */
+ if (state->Level == 1)
+ {
+ numInputTapes = state->currentRun;
+ numTapes = numInputTapes + 1;
+ FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
+ }
+ else
+ {
+ numInputTapes = state->tapeRange;
+ numTapes = state->maxTapes;
+ }
+
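+ /*
+ * For example (illustrative): with maxTapes = 7 and only 3 initial runs,
+ * we are at Level 1, so numInputTapes = 3 and numTapes = 4, and we refund
+ * (7 - 4) * TAPE_BUFFER_OVERHEAD bytes that had been reserved for the
+ * three tapes that will never be used.
+ */
+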
+ /*
+ * Initialize the slab allocator. We need one slab slot per input tape,
+ * for the tuples in the heap, plus one to hold the tuple last returned
+ * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
+ * however, we don't need to allocate anything.)
+ *
+ * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
+ * to track memory usage of individual tuples.
+ */
+ if (state->tuples)
+ init_slab_allocator(state, numInputTapes + 1);
+ else
+ init_slab_allocator(state, 0);
+
/*
* If we produced only one initial run (quite likely if the total data
* volume is between 1X and 2X workMem when replacement selection is used,
return;
}
+ /*
+ * Use all the remaining spare memory as read buffers for the input
+ * tapes.
+ *
+ * We do this only after checking for the case that we produced only one
+ * initial run, because there is no need to use a large read buffer when
+ * we're reading from a single tape. With one tape, the I/O pattern will
+ * be the same regardless of the buffer size.
+ *
+ * We don't try to "rebalance" the amount of memory among tapes, when we
+ * start a new merge phase, even if some tapes can be inactive in the
+ * phase. That would be hard, because logtape.c doesn't know where one
+ * run ends and another begins. When a new merge phase begins, and a tape
+ * doesn't participate in it, its buffer nevertheless already contains
+ * tuples from the next run on same tape, so we cannot release the buffer.
+ * That's OK in practice, merge performance isn't that sensitive to the
+ * amount of buffers used, and most merge phases use all or almost all
+ * tapes, anyway.
+ */
+ init_tape_buffers(state, numInputTapes);
+
+ /*
+ * Allocate a new 'memtuples' array for the heap. It will hold one tuple
+ * from each input tape.
+ */
+ state->memtupsize = numInputTapes;
+ state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+ USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+
/* End of step D2: rewind all output tapes to prepare for merging */
for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
LogicalTapeRewind(state->tapeset, tapenum, false);
/* Tell logtape.c we won't be writing anymore */
LogicalTapeSetForgetFreeSpace(state->tapeset);
/* Initialize for the final merge pass */
- beginmerge(state, state->tuples);
+ beginmerge(state);
state->status = TSS_FINALMERGE;
return;
}
state->result_tape = state->tp_tapenum[state->tapeRange];
LogicalTapeFreeze(state->tapeset, state->result_tape);
state->status = TSS_SORTEDONTAPE;
+
+ /* Release the read buffers of all the other tapes, by rewinding them. */
+ for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
+ {
+ if (tapenum != state->result_tape)
+ LogicalTapeRewind(state->tapeset, tapenum, true);
+ }
}
/*
{
int destTape = state->tp_tapenum[state->tapeRange];
int srcTape;
- int tupIndex;
- SortTuple *tup;
- int64 priorAvail,
- spaceFreed;
/*
* Start the merge by loading one tuple from each active source tape into
* the heap. We can also decrease the input run/dummy run counts.
*/
- beginmerge(state, false);
+ beginmerge(state);
/*
* Execute merge by repeatedly extracting lowest tuple in heap, writing it
*/
while (state->memtupcount > 0)
{
+ SortTuple stup;
+
/* write the tuple to destTape */
- priorAvail = state->availMem;
srcTape = state->memtuples[0].tupindex;
WRITETUP(state, destTape, &state->memtuples[0]);
- /* writetup adjusted total free space, now fix per-tape space */
- spaceFreed = state->availMem - priorAvail;
- state->mergeavailmem[srcTape] += spaceFreed;
- if ((tupIndex = state->mergenext[srcTape]) == 0)
- {
- /* out of preloaded data on this tape, try to read more */
- mergepreread(state);
- /* if still no data, we've reached end of run on this tape */
- if ((tupIndex = state->mergenext[srcTape]) == 0)
- {
- /* remove the written-out tuple from the heap */
- tuplesort_heap_delete_top(state, false);
- continue;
- }
- }
+
+ /*
+ * Recycle the slot of the tuple we just wrote out, for the next read.
+ * (The tuple can be NULL in a pass-by-value Datum sort, in which case
+ * there is no memory to recycle.)
+ */
+ if (state->memtuples[0].tuple)
+ RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
/*
- * pull next preread tuple from list, and replace the written-out
- * tuple in the heap with it.
+ * pull next tuple from the tape, and replace the written-out tuple in
+ * the heap with it.
*/
- tup = &state->memtuples[tupIndex];
- state->mergenext[srcTape] = tup->tupindex;
- if (state->mergenext[srcTape] == 0)
- state->mergelast[srcTape] = 0;
- tup->tupindex = srcTape;
- tuplesort_heap_replace_top(state, tup, false);
- /* put the now-unused memtuples entry on the freelist */
- tup->tupindex = state->mergefreelist;
- state->mergefreelist = tupIndex;
- state->mergeavailslots[srcTape]++;
- }
+ if (mergereadnext(state, srcTape, &stup))
+ {
+ stup.tupindex = srcTape;
+ tuplesort_heap_replace_top(state, &stup, false);
- /*
- * Reset tuple memory. We've freed all of the tuples that we previously
- * allocated, but AllocSetFree will have put those chunks of memory on
- * particular free lists, bucketed by size class. Thus, although all of
- * that memory is free, it is effectively fragmented. Resetting the
- * context gets us out from under that problem.
- */
- MemoryContextReset(state->tuplecontext);
+ }
+ else
+ tuplesort_heap_delete_top(state, false);
+ }
/*
* When the heap empties, we're done. Write an end-of-run marker on the
- * which tapes contain active input runs in mergeactive[]. Then, load
- * as many tuples as we can from each active input tape, and finally
- * fill the merge heap with the first tuple from each active tape.
+ * which tapes contain active input runs in mergeactive[]. Then, fill
+ * the merge heap with the first tuple from each active tape.
- *
- * finalMergeBatch indicates if this is the beginning of a final on-the-fly
- * merge where a batched allocation of tuple memory is required.
*/
static void
-beginmerge(Tuplesortstate *state, bool finalMergeBatch)
+beginmerge(Tuplesortstate *state)
{
int activeTapes;
int tapenum;
int srcTape;
- int slotsPerTape;
- int64 spacePerTape;
/* Heap should be empty here */
Assert(state->memtupcount == 0);
activeTapes++;
}
}
- state->activeTapes = activeTapes;
-
- /* Clear merge-pass state variables */
- memset(state->mergenext, 0,
- state->maxTapes * sizeof(*state->mergenext));
- memset(state->mergelast, 0,
- state->maxTapes * sizeof(*state->mergelast));
- state->mergefreelist = 0; /* nothing in the freelist */
- state->mergefirstfree = activeTapes; /* 1st slot avail for preread */
-
- if (finalMergeBatch)
- {
- /* Free outright buffers for tape never actually allocated */
- FREEMEM(state, (state->maxTapes - activeTapes) * TAPE_BUFFER_OVERHEAD);
-
- /*
- * Grow memtuples one last time, since the palloc() overhead no longer
- * incurred can make a big difference
- */
- batchmemtuples(state);
- }
-
- /*
- * Initialize space allocation to let each active input tape have an equal
- * share of preread space.
- */
Assert(activeTapes > 0);
- slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes;
- Assert(slotsPerTape > 0);
- spacePerTape = MAXALIGN_DOWN(state->availMem / activeTapes);
- for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
- {
- if (state->mergeactive[srcTape])
- {
- state->mergeavailslots[srcTape] = slotsPerTape;
- state->mergeavailmem[srcTape] = spacePerTape;
- }
- }
-
- /*
- * Preallocate tuple batch memory for each tape. This is the memory used
- * for tuples themselves (not SortTuples), so it's never used by
- * pass-by-value datum sorts. Memory allocation is performed here at most
- * once per sort, just in advance of the final on-the-fly merge step.
- */
- if (finalMergeBatch)
- mergebatch(state, spacePerTape);
-
- /*
- * Preread as many tuples as possible (and at least one) from each active
- * tape
- */
- mergepreread(state);
+ state->activeTapes = activeTapes;
/* Load the merge heap with the first tuple from each input tape */
for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
{
- int tupIndex = state->mergenext[srcTape];
- SortTuple *tup;
+ SortTuple tup;
- if (tupIndex)
+ if (mergereadnext(state, srcTape, &tup))
{
- tup = &state->memtuples[tupIndex];
- state->mergenext[srcTape] = tup->tupindex;
- if (state->mergenext[srcTape] == 0)
- state->mergelast[srcTape] = 0;
- tup->tupindex = srcTape;
- tuplesort_heap_insert(state, tup, false);
- /* put the now-unused memtuples entry on the freelist */
- tup->tupindex = state->mergefreelist;
- state->mergefreelist = tupIndex;
- state->mergeavailslots[srcTape]++;
-
-#ifdef TRACE_SORT
- if (trace_sort && finalMergeBatch)
- {
- int64 perTapeKB = (spacePerTape + 1023) / 1024;
- int64 usedSpaceKB;
- int usedSlots;
-
- /*
- * Report how effective batchmemtuples() was in balancing the
- * number of slots against the need for memory for the
- * underlying tuples (e.g. IndexTuples). The big preread of
- * all tapes when switching to FINALMERGE state should be
- * fairly representative of memory utilization during the
- * final merge step, and in any case is the only point at
- * which all tapes are guaranteed to have depleted either
- * their batch memory allowance or slot allowance. Ideally,
- * both will be completely depleted for every tape by now.
- */
- usedSpaceKB = (state->mergecurrent[srcTape] -
- state->mergetuples[srcTape] + 1023) / 1024;
- usedSlots = slotsPerTape - state->mergeavailslots[srcTape];
-
- elog(LOG, "tape %d initially used " INT64_FORMAT " KB of "
- INT64_FORMAT " KB batch (%2.3f) and %d out of %d slots "
- "(%2.3f)", srcTape,
- usedSpaceKB, perTapeKB,
- (double) usedSpaceKB / (double) perTapeKB,
- usedSlots, slotsPerTape,
- (double) usedSlots / (double) slotsPerTape);
- }
-#endif
+ tup.tupindex = srcTape;
+ tuplesort_heap_insert(state, &tup, false);
}
}
}
/*
- * batchmemtuples - grow memtuples without palloc overhead
- *
- * When called, availMem should be approximately the amount of memory we'd
- * require to allocate memtupsize - memtupcount tuples (not SortTuples/slots)
- * that were allocated with palloc() overhead, and in doing so use up all
- * allocated slots. However, though slots and tuple memory is in balance
- * following the last grow_memtuples() call, that's predicated on the observed
- * average tuple size for the "final" grow_memtuples() call, which includes
- * palloc overhead. During the final merge pass, where we will arrange to
- * squeeze out the palloc overhead, we might need more slots in the memtuples
- * array.
- *
- * To make that happen, arrange for the amount of remaining memory to be
- * exactly equal to the palloc overhead multiplied by the current size of
- * the memtuples array, force the grow_memtuples flag back to true (it's
- * probably but not necessarily false on entry to this routine), and then
- * call grow_memtuples. This simulates loading enough tuples to fill the
- * whole memtuples array and then having some space left over because of the
- * elided palloc overhead. We expect that grow_memtuples() will conclude that
- * it can't double the size of the memtuples array but that it can increase
- * it by some percentage; but if it does decide to double it, that just means
- * that we've never managed to use many slots in the memtuples array, in which
- * case doubling it shouldn't hurt anything anyway.
- */
-static void
-batchmemtuples(Tuplesortstate *state)
-{
- int64 refund;
- int64 availMemLessRefund;
- int memtupsize = state->memtupsize;
-
- /* Caller error if we have no tapes */
- Assert(state->activeTapes > 0);
-
- /* For simplicity, assume no memtuples are actually currently counted */
- Assert(state->memtupcount == 0);
-
- /*
- * Refund STANDARDCHUNKHEADERSIZE per tuple.
- *
- * This sometimes fails to make memory use perfectly balanced, but it
- * should never make the situation worse. Note that Assert-enabled builds
- * get a larger refund, due to a varying STANDARDCHUNKHEADERSIZE.
- */
- refund = memtupsize * STANDARDCHUNKHEADERSIZE;
- availMemLessRefund = state->availMem - refund;
-
- /*
- * We need to be sure that we do not cause LACKMEM to become true, else
- * the batch allocation size could be calculated as negative, causing
- * havoc. Hence, if availMemLessRefund is negative at this point, we must
- * do nothing. Moreover, if it's positive but rather small, there's
- * little point in proceeding because we could only increase memtuples by
- * a small amount, not worth the cost of the repalloc's. We somewhat
- * arbitrarily set the threshold at ALLOCSET_DEFAULT_INITSIZE per tape.
- * (Note that this does not represent any assumption about tuple sizes.)
- */
- if (availMemLessRefund <=
- (int64) state->activeTapes * ALLOCSET_DEFAULT_INITSIZE)
- return;
-
- /*
- * To establish balanced memory use after refunding palloc overhead,
- * temporarily have our accounting indicate that we've allocated all
- * memory we're allowed to less that refund, and call grow_memtuples() to
- * have it increase the number of slots.
- */
- state->growmemtuples = true;
- USEMEM(state, availMemLessRefund);
- (void) grow_memtuples(state);
- state->growmemtuples = false;
- /* availMem must stay accurate for spacePerTape calculation */
- FREEMEM(state, availMemLessRefund);
- if (LACKMEM(state))
- elog(ERROR, "unexpected out-of-memory situation in tuplesort");
-
-#ifdef TRACE_SORT
- if (trace_sort)
- {
- Size OldKb = (memtupsize * sizeof(SortTuple) + 1023) / 1024;
- Size NewKb = (state->memtupsize * sizeof(SortTuple) + 1023) / 1024;
-
- elog(LOG, "grew memtuples %1.2fx from %d (%zu KB) to %d (%zu KB) for final merge",
- (double) NewKb / (double) OldKb,
- memtupsize, OldKb,
- state->memtupsize, NewKb);
- }
-#endif
-}
-
-/*
- * mergebatch - initialize tuple memory in batch
- *
- * This allows sequential access to sorted tuples buffered in memory from
- * tapes/runs on disk during a final on-the-fly merge step. Note that the
- * memory is not used for SortTuples, but for the underlying tuples (e.g.
- * MinimalTuples).
- *
- * Note that when batch memory is used, there is a simple division of space
- * into large buffers (one per active tape). The conventional incremental
- * memory accounting (calling USEMEM() and FREEMEM()) is abandoned. Instead,
- * when each tape's memory budget is exceeded, a retail palloc() "overflow" is
- * performed, which is then immediately detected in a way that is analogous to
- * LACKMEM(). This keeps each tape's use of memory fair, which is always a
- * goal.
- */
-static void
-mergebatch(Tuplesortstate *state, int64 spacePerTape)
-{
- int srcTape;
-
- Assert(state->activeTapes > 0);
- Assert(state->tuples);
-
- /*
- * For the purposes of tuplesort's memory accounting, the batch allocation
- * is special, and regular memory accounting through USEMEM() calls is
- * abandoned (see mergeprereadone()).
- */
- for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
- {
- char *mergetuples;
-
- if (!state->mergeactive[srcTape])
- continue;
-
- /* Allocate buffer for each active tape */
- mergetuples = MemoryContextAllocHuge(state->tuplecontext,
- spacePerTape);
-
- /* Initialize state for tape */
- state->mergetuples[srcTape] = mergetuples;
- state->mergecurrent[srcTape] = mergetuples;
- state->mergetail[srcTape] = mergetuples;
- state->mergeoverflow[srcTape] = NULL;
- }
-
- state->batchUsed = true;
- state->spacePerTape = spacePerTape;
-}
-
-/*
- * mergebatchone - prepare batch memory for one merge input tape
- *
- * This is called following the exhaustion of preread tuples for one input
- * tape. All that actually occurs is that the state for the source tape is
- * reset to indicate that all memory may be reused.
- *
- * This routine must deal with fixing up the tuple that is about to be returned
- * to the client, due to "overflow" allocations.
- */
-static void
-mergebatchone(Tuplesortstate *state, int srcTape, SortTuple *rtup,
- bool *should_free)
-{
- Assert(state->batchUsed);
-
- /*
- * Tuple about to be returned to caller ("stup") is final preread tuple
- * from tape, just removed from the top of the heap. Special steps around
- * memory management must be performed for that tuple, to make sure it
- * isn't overwritten early.
- */
- if (!state->mergeoverflow[srcTape])
- {
- Size tupLen;
-
- /*
- * Mark tuple buffer range for reuse, but be careful to move final,
- * tail tuple to start of space for next run so that it's available to
- * caller when stup is returned, and remains available at least until
- * the next tuple is requested.
- */
- tupLen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
- state->mergecurrent[srcTape] = state->mergetuples[srcTape];
- MOVETUP(state->mergecurrent[srcTape], state->mergetail[srcTape],
- tupLen);
-
- /* Make SortTuple at top of the merge heap point to new tuple */
- rtup->tuple = (void *) state->mergecurrent[srcTape];
-
- state->mergetail[srcTape] = state->mergecurrent[srcTape];
- state->mergecurrent[srcTape] += tupLen;
- }
- else
- {
- /*
- * Handle an "overflow" retail palloc.
- *
- * This is needed when we run out of tuple memory for the tape.
- */
- state->mergecurrent[srcTape] = state->mergetuples[srcTape];
- state->mergetail[srcTape] = state->mergetuples[srcTape];
-
- if (rtup->tuple)
- {
- Assert(rtup->tuple == (void *) state->mergeoverflow[srcTape]);
- /* Caller should free palloc'd tuple */
- *should_free = true;
- }
- state->mergeoverflow[srcTape] = NULL;
- }
-}
-
-/*
- * mergebatchfreetape - handle final clean-up for batch memory once tape is
- * about to become exhausted
- *
- * All tuples are returned from tape, but a single final tuple, *rtup, is to be
- * passed back to caller. Free tape's batch allocation buffer while ensuring
- * that the final tuple is managed appropriately.
- */
-static void
-mergebatchfreetape(Tuplesortstate *state, int srcTape, SortTuple *rtup,
- bool *should_free)
-{
- Assert(state->batchUsed);
- Assert(state->status == TSS_FINALMERGE);
-
- /*
- * Tuple may or may not already be an overflow allocation from
- * mergebatchone()
- */
- if (!*should_free && rtup->tuple)
- {
- /*
- * Final tuple still in tape's batch allocation.
- *
- * Return palloc()'d copy to caller, and have it freed in a similar
- * manner to overflow allocation. Otherwise, we'd free batch memory
- * and pass back a pointer to garbage. Note that we deliberately
- * allocate this in the parent tuplesort context, to be on the safe
- * side.
- */
- Size tuplen;
- void *oldTuple = rtup->tuple;
-
- tuplen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
- rtup->tuple = MemoryContextAlloc(state->sortcontext, tuplen);
- MOVETUP(rtup->tuple, oldTuple, tuplen);
- *should_free = true;
- }
-
- /* Free spacePerTape-sized buffer */
- pfree(state->mergetuples[srcTape]);
-}
-
-/*
- * mergebatchalloc - allocate memory for one tuple using a batch memory
- * "logical allocation".
- *
- * This is used for the final on-the-fly merge phase only. READTUP() routines
- * receive memory from here in place of palloc() and USEMEM() calls.
- *
- * Tuple tapenum is passed, ensuring each tape's tuples are stored in sorted,
- * contiguous order (while allowing safe reuse of memory made available to
- * each tape). This maximizes locality of access as tuples are returned by
- * final merge.
+ * mergereadnext - read next tuple from one merge input tape
*
- * Caller must not subsequently attempt to free memory returned here. In
- * general, only mergebatch* functions know about how memory returned from
- * here should be freed, and this function's caller must ensure that batch
- * memory management code will definitely have the opportunity to do the right
- * thing during the final on-the-fly merge.
+ * Returns false on EOF.
*/
-static void *
-mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen)
-{
- Size reserve_tuplen = MAXALIGN(tuplen);
- char *ret;
-
- /* Should overflow at most once before mergebatchone() call: */
- Assert(state->mergeoverflow[tapenum] == NULL);
- Assert(state->batchUsed);
-
- /* It should be possible to use precisely spacePerTape memory at once */
- if (state->mergecurrent[tapenum] + reserve_tuplen <=
- state->mergetuples[tapenum] + state->spacePerTape)
- {
- /*
- * Usual case -- caller is returned pointer into its tape's buffer,
- * and an offset from that point is recorded as where tape has
- * consumed up to for current round of preloading.
- */
- ret = state->mergetail[tapenum] = state->mergecurrent[tapenum];
- state->mergecurrent[tapenum] += reserve_tuplen;
- }
- else
- {
- /*
- * Allocate memory, and record as tape's overflow allocation. This
- * will be detected quickly, in a similar fashion to a LACKMEM()
- * condition, and should not happen again before a new round of
- * preloading for caller's tape. Note that we deliberately allocate
- * this in the parent tuplesort context, to be on the safe side.
- *
- * Sometimes, this does not happen because merging runs out of slots
- * before running out of memory.
- */
- ret = state->mergeoverflow[tapenum] =
- MemoryContextAlloc(state->sortcontext, tuplen);
- }
-
- return ret;
-}
-
-/*
- * mergepreread - load tuples from merge input tapes
- *
- * This routine exists to improve sequentiality of reads during a merge pass,
- * as explained in the header comments of this file. Load tuples from each
- * active source tape until the tape's run is exhausted or it has used up
- * its fair share of available memory. In any case, we guarantee that there
- * is at least one preread tuple available from each unexhausted input tape.
- *
- * We invoke this routine at the start of a merge pass for initial load,
- * and then whenever any tape's preread data runs out. Note that we load
- * as much data as possible from all tapes, not just the one that ran out.
- * This is because logtape.c works best with a usage pattern that alternates
- * between reading a lot of data and writing a lot of data, so whenever we
- * are forced to read, we should fill working memory completely.
- *
- * In FINALMERGE state, we *don't* use this routine, but instead just preread
- * from the single tape that ran dry. There's no read/write alternation in
- * that state and so no point in scanning through all the tapes to fix one.
- * (Moreover, there may be quite a lot of inactive tapes in that state, since
- * we might have had many fewer runs than tapes. In a regular tape-to-tape
- * merge we can expect most of the tapes to be active. Plus, only
- * FINALMERGE state has to consider memory management for a batch
- * allocation.)
- */
-static void
-mergepreread(Tuplesortstate *state)
-{
- int srcTape;
-
- for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
- mergeprereadone(state, srcTape);
-}
-
-/*
- * mergeprereadone - load tuples from one merge input tape
- *
- * Read tuples from the specified tape until it has used up its free memory
- * or array slots; but ensure that we have at least one tuple, if any are
- * to be had.
- */
-static void
-mergeprereadone(Tuplesortstate *state, int srcTape)
+static bool
+mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
{
unsigned int tuplen;
- SortTuple stup;
- int tupIndex;
- int64 priorAvail,
- spaceUsed;
if (!state->mergeactive[srcTape])
- return; /* tape's run is already exhausted */
-
- /*
- * Manage per-tape availMem. Only actually matters when batch memory not
- * in use.
- */
- priorAvail = state->availMem;
- state->availMem = state->mergeavailmem[srcTape];
+ return false; /* tape's run is already exhausted */
- /*
- * When batch memory is used if final on-the-fly merge, only mergeoverflow
- * test is relevant; otherwise, only LACKMEM() test is relevant.
- */
- while ((state->mergeavailslots[srcTape] > 0 &&
- state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) ||
- state->mergenext[srcTape] == 0)
+ /* read next tuple, if any */
+ if ((tuplen = getlen(state, srcTape, true)) == 0)
{
- /* read next tuple, if any */
- if ((tuplen = getlen(state, srcTape, true)) == 0)
- {
- state->mergeactive[srcTape] = false;
- break;
- }
- READTUP(state, &stup, srcTape, tuplen);
- /* find a free slot in memtuples[] for it */
- tupIndex = state->mergefreelist;
- if (tupIndex)
- state->mergefreelist = state->memtuples[tupIndex].tupindex;
- else
- {
- tupIndex = state->mergefirstfree++;
- Assert(tupIndex < state->memtupsize);
- }
- state->mergeavailslots[srcTape]--;
- /* store tuple, append to list for its tape */
- stup.tupindex = 0;
- state->memtuples[tupIndex] = stup;
- if (state->mergelast[srcTape])
- state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex;
- else
- state->mergenext[srcTape] = tupIndex;
- state->mergelast[srcTape] = tupIndex;
+ state->mergeactive[srcTape] = false;
+ return false;
}
- /* update per-tape and global availmem counts */
- spaceUsed = state->mergeavailmem[srcTape] - state->availMem;
- state->mergeavailmem[srcTape] = state->availMem;
- state->availMem = priorAvail - spaceUsed;
+ READTUP(state, stup, srcTape, tuplen);
+
+ return true;
}
/*
/*
* Reset tuple memory. We've freed all of the tuples that we previously
* allocated. It's important to avoid fragmentation when there is a stark
- * change in allocation patterns due to the use of batch memory.
- * Fragmentation due to AllocSetFree's bucketing by size class might be
- * particularly bad if this step wasn't taken.
+ * change in the sizes of incoming tuples. Fragmentation due to
+ * AllocSetFree's bucketing by size class might be particularly bad if
+ * this step wasn't taken.
*/
MemoryContextReset(state->tuplecontext);
}
/*
- * Get memory for tuple from within READTUP() routine. Allocate
- * memory and account for that, or consume from tape's batch
- * allocation.
+ * Get memory for tuple from within READTUP() routine.
*
- * Memory returned here in the final on-the-fly merge case is recycled
- * from tape's batch allocation. Otherwise, callers must pfree() or
- * reset tuple child memory context, and account for that with a
- * FREEMEM(). Currently, this only ever needs to happen in WRITETUP()
- * routines.
+ * We use the next free slot from the slab allocator, or palloc() if the
+ * tuple is too large for that.
*/
static void *
-readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen)
+readtup_alloc(Tuplesortstate *state, Size tuplen)
{
- if (state->batchUsed)
- {
- /*
- * No USEMEM() call, because during final on-the-fly merge accounting
- * is based on tape-private state. ("Overflow" allocations are
- * detected as an indication that a new round or preloading is
- * required. Preloading marks existing contents of tape's batch buffer
- * for reuse.)
- */
- return mergebatchalloc(state, tapenum, tuplen);
- }
+ SlabSlot *buf;
+
+ /*
+ * We pre-allocate enough slots in the slab arena that we should never run
+ * out, but fall back to palloc() in production builds if we somehow do.
+ */
+ Assert(state->slabFreeHead);
+
+ if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
+ return MemoryContextAlloc(state->sortcontext, tuplen);
else
{
- char *ret;
+ buf = state->slabFreeHead;
+ /* Reuse this slot */
+ state->slabFreeHead = buf->nextfree;
- /* Batch allocation yet to be performed */
- ret = MemoryContextAlloc(state->tuplecontext, tuplen);
- USEMEM(state, GetMemoryChunkSpace(ret));
- return ret;
+ return buf;
}
}
LogicalTapeWrite(state->tapeset, tapenum,
(void *) &tuplen, sizeof(tuplen));
- FREEMEM(state, GetMemoryChunkSpace(tuple));
- heap_free_minimal_tuple(tuple);
+ if (!state->slabAllocatorUsed)
+ {
+ FREEMEM(state, GetMemoryChunkSpace(tuple));
+ heap_free_minimal_tuple(tuple);
+ }
}
static void
{
unsigned int tupbodylen = len - sizeof(int);
unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
- MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tapenum, tuplen);
+ MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
HeapTupleData htup;
&stup->isnull1);
}
-static void
-movetup_heap(void *dest, void *src, unsigned int len)
-{
- memmove(dest, src, len);
-}
-
/*
* Routines specialized for the CLUSTER case (HeapTuple data, with
* comparisons per a btree index definition)
LogicalTapeWrite(state->tapeset, tapenum,
&tuplen, sizeof(tuplen));
- FREEMEM(state, GetMemoryChunkSpace(tuple));
- heap_freetuple(tuple);
+ if (!state->slabAllocatorUsed)
+ {
+ FREEMEM(state, GetMemoryChunkSpace(tuple));
+ heap_freetuple(tuple);
+ }
}
static void
{
unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
HeapTuple tuple = (HeapTuple) readtup_alloc(state,
- tapenum,
t_len + HEAPTUPLESIZE);
/* Reconstruct the HeapTupleData header */
&stup->isnull1);
}
-static void
-movetup_cluster(void *dest, void *src, unsigned int len)
-{
- HeapTuple tuple;
-
- memmove(dest, src, len);
-
- /* Repoint the HeapTupleData header */
- tuple = (HeapTuple) dest;
- tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
-}
-
-
/*
* Routines specialized for IndexTuple case
*
LogicalTapeWrite(state->tapeset, tapenum,
(void *) &tuplen, sizeof(tuplen));
- FREEMEM(state, GetMemoryChunkSpace(tuple));
- pfree(tuple);
+ if (!state->slabAllocatorUsed)
+ {
+ FREEMEM(state, GetMemoryChunkSpace(tuple));
+ pfree(tuple);
+ }
}
static void
int tapenum, unsigned int len)
{
unsigned int tuplen = len - sizeof(unsigned int);
- IndexTuple tuple = (IndexTuple) readtup_alloc(state, tapenum, tuplen);
+ IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen);
LogicalTapeReadExact(state->tapeset, tapenum,
tuple, tuplen);
&stup->isnull1);
}
-static void
-movetup_index(void *dest, void *src, unsigned int len)
-{
- memmove(dest, src, len);
-}
-
/*
* Routines specialized for DatumTuple case
*/
LogicalTapeWrite(state->tapeset, tapenum,
(void *) &writtenlen, sizeof(writtenlen));
- if (stup->tuple)
+ if (!state->slabAllocatorUsed && stup->tuple)
{
FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
pfree(stup->tuple);
}
else
{
- void *raddr = readtup_alloc(state, tapenum, tuplen);
+ void *raddr = readtup_alloc(state, tuplen);
LogicalTapeReadExact(state->tapeset, tapenum,
raddr, tuplen);
&tuplen, sizeof(tuplen));
}
-static void
-movetup_datum(void *dest, void *src, unsigned int len)
-{
- memmove(dest, src, len);
-}
-
/*
* Convenience routine to free a tuple previously loaded into sort memory
*/