granicus.if.org Git - postgresql/blob - src/backend/utils/sort/tuplesort.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * tuplesort.c
   4  *        Generalized tuple sorting routines.
   5  *
   6  * This module handles sorting of heap tuples, index tuples, or single
   7  * Datums (and could easily support other kinds of sortable objects,
   8  * if necessary).  It works efficiently for both small and large amounts
   9  * of data.  Small amounts are sorted in-memory using qsort().  Large
  10  * amounts are sorted using temporary files and a standard external sort
  11  * algorithm.
  12  *
  13  * See Knuth, volume 3, for more than you want to know about the external
  14  * sorting algorithm.  We divide the input into sorted runs using replacement
  15  * selection, in the form of a priority tree implemented as a heap
  16  * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
  17  * merge, Knuth's Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D
  18  * are implemented by logtape.c, which avoids space wastage by recycling
  19  * disk space as soon as each block is read from its "tape".
  20  *
  21  * We do not form the initial runs using Knuth's recommended replacement
  22  * selection data structure (Algorithm 5.4.1R), because it uses a fixed
  23  * number of records in memory at all times.  Since we are dealing with
  24  * tuples that may vary considerably in size, we want to be able to vary
  25  * the number of records kept in memory to ensure full utilization of the
  26  * allowed sort memory space.  So, we keep the tuples in a variable-size
  27  * heap, with the next record to go out at the top of the heap.  Like
  28  * Algorithm 5.4.1R, each record is stored with the run number that it
  29  * must go into, and we use (run number, key) as the ordering key for the
  30  * heap.  When the run number at the top of the heap changes, we know that
  31  * no more records of the prior run are left in the heap.
  32  *
  33  * The approximate amount of memory allowed for any one sort operation
  34  * is specified in kilobytes by the caller (most pass work_mem).  Initially,
  35  * we absorb tuples and simply store them in an unsorted array as long as
  36  * we haven't exceeded workMem.  If we reach the end of the input without
  37  * exceeding workMem, we sort the array using qsort() and subsequently return
  38  * tuples just by scanning the tuple array sequentially.  If we do exceed
  39  * workMem, we construct a heap using Algorithm H and begin to emit tuples
  40  * into sorted runs in temporary tapes, emitting just enough tuples at each
  41  * step to get back within the workMem limit.  Whenever the run number at
  42  * the top of the heap changes, we begin a new run with a new output tape
  43  * (selected per Algorithm D).  After the end of the input is reached,
  44  * we dump out remaining tuples in memory into a final run (or two),
  45  * then merge the runs using Algorithm D.
  46  *
  47  * When merging runs, we use a heap containing just the frontmost tuple from
  48  * each source run; we repeatedly output the smallest tuple and insert the
  49  * next tuple from its source tape (if any).  When the heap empties, the merge
  50  * is complete.  The basic merge algorithm thus needs very little memory ---
  51  * only M tuples for an M-way merge, and M is constrained to a small number.
  52  * However, we can still make good use of our full workMem allocation by
  53  * pre-reading additional tuples from each source tape.  Without prereading,
  54  * our access pattern to the temporary file would be very erratic; on average
  55  * we'd read one block from each of M source tapes during the same time that
  56  * we're writing M blocks to the output tape, so there is no sequentiality of
  57  * access at all, defeating the read-ahead methods used by most Unix kernels.
  58  * Worse, the output tape gets written into a very random sequence of blocks
  59  * of the temp file, ensuring that things will be even worse when it comes
  60  * time to read that tape.  A straightforward merge pass thus ends up doing a
  61  * lot of waiting for disk seeks.  We can improve matters by prereading from
  62  * each source tape sequentially, loading about workMem/M bytes from each tape
  63  * in turn.  Then we run the merge algorithm, writing but not reading until
  64  * one of the preloaded tuple series runs out.  Then we switch back to preread
  65  * mode, fill memory again, and repeat.  This approach helps to localize both
  66  * read and write accesses.
  67  *
  68  * When the caller requests random access to the sort result, we form
  69  * the final sorted run on a logical tape which is then "frozen", so
  70  * that we can access it randomly.  When the caller does not need random
  71  * access, we return from tuplesort_performsort() as soon as we are down
  72  * to one run per logical tape.  The final merge is then performed
  73  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
  74  * saves one cycle of writing all the data out to disk and reading it in.
  75  *
  76  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
  77  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
  78  * to Knuth's figure 70 (section 5.4.2).  However, Knuth is assuming that
  79  * tape drives are expensive beasts, and in particular that there will always
  80  * be many more runs than tape drives.  In our implementation a "tape drive"
  81  * doesn't cost much more than a few Kb of memory buffers, so we can afford
  82  * to have lots of them.  In particular, if we can have as many tape drives
  83  * as sorted runs, we can eliminate any repeated I/O at all.  In the current
  84  * code we determine the number of tapes M on the basis of workMem: we want
  85  * workMem/M to be large enough that we read a fair amount of data each time
  86  * we preread from a tape, so as to maintain the locality of access described
  87  * above.  Nonetheless, with large workMem we can have many tapes.
  88  *
  89  *
  90  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
  91  * Portions Copyright (c) 1994, Regents of the University of California
  92  *
  93  * IDENTIFICATION
  94  *        src/backend/utils/sort/tuplesort.c
  95  *
  96  *-------------------------------------------------------------------------
  97  */
  98
  99 #include "postgres.h"
 100
 101 #include <limits.h>
 102
 103 #include "access/htup_details.h"
 104 #include "access/nbtree.h"
 105 #include "catalog/index.h"
 106 #include "catalog/pg_am.h"
 107 #include "commands/tablespace.h"
 108 #include "executor/executor.h"
 109 #include "miscadmin.h"
 110 #include "pg_trace.h"
 111 #include "utils/datum.h"
 112 #include "utils/logtape.h"
 113 #include "utils/lsyscache.h"
 114 #include "utils/memutils.h"
 115 #include "utils/pg_rusage.h"
 116 #include "utils/rel.h"
 117 #include "utils/sortsupport.h"
 118 #include "utils/tuplesort.h"
 119
 120
 121 /* sort-type codes for sort__start probes */
 122 #define HEAP_SORT               0
 123 #define INDEX_SORT              1
 124 #define DATUM_SORT              2
 125 #define CLUSTER_SORT    3
 126
 127 /* GUC variables */
 128 #ifdef TRACE_SORT
 129 bool            trace_sort = false;
 130 #endif
 131
 132 #ifdef DEBUG_BOUNDED_SORT
 133 bool            optimize_bounded_sort = true;
 134 #endif
 135
 136
 137 /*
 138  * The objects we actually sort are SortTuple structs.  These contain
 139  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
 140  * which is a separate palloc chunk --- we assume it is just one chunk and
 141  * can be freed by a simple pfree() (except during final on-the-fly merge,
 142  * when memory is used in batch).  SortTuples also contain the tuple's
 143  * first key column in Datum/nullflag format, and an index integer.
 144  *
 145  * Storing the first key column lets us save heap_getattr or index_getattr
 146  * calls during tuple comparisons.  We could extract and save all the key
 147  * columns not just the first, but this would increase code complexity and
 148  * overhead, and wouldn't actually save any comparison cycles in the common
 149  * case where the first key determines the comparison result.  Note that
 150  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
 151  *
 152  * There is one special case: when the sort support infrastructure provides an
 153  * "abbreviated key" representation, where the key is (typically) a pass by
 154  * value proxy for a pass by reference type.  In this case, the abbreviated key
 155  * is stored in datum1 in place of the actual first key column.
 156  *
 157  * When sorting single Datums, the data value is represented directly by
 158  * datum1/isnull1 for pass by value types (or null values).  If the datatype is
 159  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
 160  * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
 161  * either the same pointer as "tuple", or is an abbreviated key value as
 162  * described above.  Accordingly, "tuple" is always used in preference to
 163  * datum1 as the authoritative value for pass-by-reference cases.
 164  *
 165  * While building initial runs, tupindex holds the tuple's run number.  During
 166  * merge passes, we re-use it to hold the input tape number that each tuple in
 167  * the heap was read from, or to hold the index of the next tuple pre-read
 168  * from the same tape in the case of pre-read entries.  tupindex goes unused
 169  * if the sort occurs entirely in memory.
 170  */
 171 typedef struct
 172 {
 173         void       *tuple;                      /* the tuple proper */
 174         Datum           datum1;                 /* value of first key column */
 175         bool            isnull1;                /* is first key column NULL? */
 176         int                     tupindex;               /* see notes above */
 177 } SortTuple;
 178
 179
 180 /*
 181  * Possible states of a Tuplesort object.  These denote the states that
 182  * persist between calls of Tuplesort routines.
 183  */
 184 typedef enum
 185 {
 186         TSS_INITIAL,                            /* Loading tuples; still within memory limit */
 187         TSS_BOUNDED,                            /* Loading tuples into bounded-size heap */
 188         TSS_BUILDRUNS,                          /* Loading tuples; writing to tape */
 189         TSS_SORTEDINMEM,                        /* Sort completed entirely in memory */
 190         TSS_SORTEDONTAPE,                       /* Sort completed, final run is on tape */
 191         TSS_FINALMERGE                          /* Performing final merge on-the-fly */
 192 } TupSortStatus;
 193
 194 /*
 195  * Parameters for calculation of number of tapes to use --- see inittapes()
 196  * and tuplesort_merge_order().
 197  *
 198  * In this calculation we assume that each tape will cost us about 3 blocks
 199  * worth of buffer space (which is an underestimate for very large data
 200  * volumes, but it's probably close enough --- see logtape.c).
 201  *
 202  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
 203  * tape during a preread cycle (see discussion at top of file).
 204  */
 205 #define MINORDER                6               /* minimum merge order */
 206 #define TAPE_BUFFER_OVERHEAD            (BLCKSZ * 3)
 207 #define MERGE_BUFFER_SIZE                       (BLCKSZ * 32)
 208
 209 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
 210                                                                                                 Tuplesortstate *state);
 211
 212 /*
 213  * Private state of a Tuplesort operation.
 214  */
 215 struct Tuplesortstate
 216 {
 217         TupSortStatus status;           /* enumerated value as shown above */
 218         int                     nKeys;                  /* number of columns in sort key */
 219         bool            randomAccess;   /* did caller request random access? */
 220         bool            bounded;                /* did caller specify a maximum number of
 221                                                                  * tuples to return? */
 222         bool            boundUsed;              /* true if we made use of a bounded heap */
 223         int                     bound;                  /* if bounded, the maximum number of tuples */
 224         bool            tuples;                 /* Can SortTuple.tuple ever be set? */
 225         int64           availMem;               /* remaining memory available, in bytes */
 226         int64           allowedMem;             /* total memory allowed, in bytes */
 227         int                     maxTapes;               /* number of tapes (Knuth's T) */
 228         int                     tapeRange;              /* maxTapes-1 (Knuth's P) */
 229         MemoryContext sortcontext;      /* memory context holding most sort data */
 230         MemoryContext tuplecontext;     /* sub-context of sortcontext for tuple data */
 231         LogicalTapeSet *tapeset;        /* logtape.c object for tapes in a temp file */
 232
 233         /*
 234          * These function pointers decouple the routines that must know what kind
 235          * of tuple we are sorting from the routines that don't need to know it.
 236          * They are set up by the tuplesort_begin_xxx routines.
 237          *
 238          * Function to compare two tuples; result is per qsort() convention, ie:
 239          * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
 240          * qsort_arg_comparator.
 241          */
 242         SortTupleComparator comparetup;
 243
 244         /*
 245          * Function to copy a supplied input tuple into palloc'd space and set up
 246          * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
 247          * state->availMem must be decreased by the amount of space used for the
 248          * tuple copy (note the SortTuple struct itself is not counted).
 249          */
 250         void            (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
 251
 252         /*
 253          * Function to write a stored tuple onto tape.  The representation of the
 254          * tuple on tape need not be the same as it is in memory; requirements on
 255          * the tape representation are given below.  After writing the tuple,
 256          * pfree() the out-of-line data (not the SortTuple struct!), and increase
 257          * state->availMem by the amount of memory space thereby released.
 258          */
 259         void            (*writetup) (Tuplesortstate *state, int tapenum,
 260                                                                                  SortTuple *stup);
 261
 262         /*
 263          * Function to read a stored tuple from tape back into memory. 'len' is
 264          * the already-read length of the stored tuple.  Create a palloc'd copy,
 265          * initialize tuple/datum1/isnull1 in the target SortTuple struct, and
 266          * decrease state->availMem by the amount of memory space consumed.
 267          */
 268         void            (*readtup) (Tuplesortstate *state, SortTuple *stup,
 269                                                                                 int tapenum, unsigned int len);
 270
 271         /*
 272          * This array holds the tuples now in sort memory.  If we are in state
 273          * INITIAL, the tuples are in no particular order; if we are in state
 274          * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
 275          * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
 276          * H.  (Note that memtupcount only counts the tuples that are part of the
 277          * heap --- during merge passes, memtuples[] entries beyond tapeRange are
 278          * never in the heap and are used to hold pre-read tuples.)  In state
 279          * SORTEDONTAPE, the array is not used.
 280          */
 281         SortTuple  *memtuples;          /* array of SortTuple structs */
 282         int                     memtupcount;    /* number of tuples currently present */
 283         int                     memtupsize;             /* allocated length of memtuples array */
 284         bool            growmemtuples;  /* memtuples' growth still underway? */
 285
 286         /*
 287          * Memory for tuples is sometimes allocated in batch, rather than
 288          * incrementally.  This implies that incremental memory accounting has been
 289          * abandoned.  Currently, this only happens for the final on-the-fly merge
 290          * step.  Large batch allocations can store tuples (e.g. IndexTuples)
 291          * without palloc() fragmentation and other overhead.
 292          */
 293         bool            batchUsed;
 294
 295         /*
 296          * While building initial runs, this is the current output run number
 297          * (starting at 0).  Afterwards, it is the number of initial runs we made.
 298          */
 299         int                     currentRun;
 300
 301         /*
 302          * Unless otherwise noted, all pointer variables below are pointers to
 303          * arrays of length maxTapes, holding per-tape data.
 304          */
 305
 306         /*
 307          * These variables are only used during merge passes.  mergeactive[i] is
 308          * true if we are reading an input run from (actual) tape number i and
 309          * have not yet exhausted that run.  mergenext[i] is the memtuples index
 310          * of the next pre-read tuple (next to be loaded into the heap) for tape
 311          * i, or 0 if we are out of pre-read tuples.  mergelast[i] similarly
 312          * points to the last pre-read tuple from each tape.  mergeavailslots[i]
 313          * is the number of unused memtuples[] slots reserved for tape i, and
 314          * mergeavailmem[i] is the amount of unused space allocated for tape i.
 315          * mergefreelist and mergefirstfree keep track of unused locations in the
 316          * memtuples[] array.  The memtuples[].tupindex fields link together
 317          * pre-read tuples for each tape as well as recycled locations in
 318          * mergefreelist. It is OK to use 0 as a null link in these lists, because
 319          * memtuples[0] is part of the merge heap and is never a pre-read tuple.
 320          */
 321         bool       *mergeactive;        /* active input run source? */
 322         int                *mergenext;          /* first preread tuple for each source */
 323         int                *mergelast;          /* last preread tuple for each source */
 324         int                *mergeavailslots;    /* slots left for prereading each tape */
 325         int64      *mergeavailmem;      /* availMem for prereading each tape */
 326         int                     mergefreelist;  /* head of freelist of recycled slots */
 327         int                     mergefirstfree; /* first slot never used in this merge */
 328
 329         /*
 330          * Per-tape batch state, when final on-the-fly merge consumes memory from
 331          * just a few large allocations.
 332          *
 333          * Aside from the general benefits of performing fewer individual retail
 334          * palloc() calls, this also helps make merging more cache efficient, since
 335          * each tape's tuples must naturally be accessed sequentially (in sorted
 336          * order).
 337          */
 338         int64           spacePerTape;   /* Space (memory) for tuples (not slots) */
 339         char      **mergetuples;        /* Each tape's memory allocation */
 340         char      **mergecurrent;       /* Current offset into each tape's memory */
 341         char      **mergetail;          /* Last item's start point for each tape */
 342         char      **mergeoverflow;      /* Retail palloc() "overflow" for each tape */
 343
 344         /*
 345          * Variables for Algorithm D.  Note that destTape is a "logical" tape
 346          * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
 347          * "logical" and "actual" tape numbers straight!
 348          */
 349         int                     Level;                  /* Knuth's l */
 350         int                     destTape;               /* current output tape (Knuth's j, less 1) */
 351         int                *tp_fib;                     /* Target Fibonacci run counts (A[]) */
 352         int                *tp_runs;            /* # of real runs on each tape */
 353         int                *tp_dummy;           /* # of dummy runs for each tape (D[]) */
 354         int                *tp_tapenum;         /* Actual tape numbers (TAPE[]) */
 355         int                     activeTapes;    /* # of active input tapes in merge pass */
 356
 357         /*
 358          * These variables are used after completion of sorting to keep track of
 359          * the next tuple to return.  (In the tape case, the tape's current read
 360          * position is also critical state.)
 361          */
 362         int                     result_tape;    /* actual tape number of finished output */
 363         int                     current;                /* array index (only used if SORTEDINMEM) */
 364         bool            eof_reached;    /* reached EOF (needed for cursors) */
 365
 366         /* markpos_xxx holds marked position for mark and restore */
 367         long            markpos_block;  /* tape block# (only used if SORTEDONTAPE) */
 368         int                     markpos_offset; /* saved "current", or offset in tape block */
 369         bool            markpos_eof;    /* saved "eof_reached" */
 370
 371         /*
 372          * The sortKeys variable is used by every case other than the hash index
 373          * case; it is set by tuplesort_begin_xxx.  tupDesc is only used by the
 374          * MinimalTuple and CLUSTER routines, though.
 375          */
 376         TupleDesc       tupDesc;
 377         SortSupport sortKeys;           /* array of length nKeys */
 378
 379         /*
 380          * This variable is shared by the single-key MinimalTuple case and the
 381          * Datum case (which both use qsort_ssup()).  Otherwise it's NULL.
 382          */
 383         SortSupport onlyKey;
 384
 385         /*
 386          * Additional state for managing "abbreviated key" sortsupport routines
 387          * (which currently may be used by all cases except the hash index case).
 388          * Tracks the intervals at which the optimization's effectiveness is
 389          * tested.
 390          */
 391         int64           abbrevNext;             /* Tuple # at which to next check
 392                                                                  * applicability */
 393
 394         /*
 395          * These variables are specific to the CLUSTER case; they are set by
 396          * tuplesort_begin_cluster.
 397          */
 398         IndexInfo  *indexInfo;          /* info about index being used for reference */
 399         EState     *estate;                     /* for evaluating index expressions */
 400
 401         /*
 402          * These variables are specific to the IndexTuple case; they are set by
 403          * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
 404          */
 405         Relation        heapRel;                /* table the index is being built on */
 406         Relation        indexRel;               /* index being built */
 407
 408         /* These are specific to the index_btree subcase: */
 409         bool            enforceUnique;  /* complain if we find duplicate tuples */
 410
 411         /* These are specific to the index_hash subcase: */
 412         uint32          hash_mask;              /* mask for sortable part of hash code */
 413
 414         /*
 415          * These variables are specific to the Datum case; they are set by
 416          * tuplesort_begin_datum and used only by the DatumTuple routines.
 417          */
 418         Oid                     datumType;
 419         /* we need typelen in order to know how to copy the Datums. */
 420         int                     datumTypeLen;
 421
 422         /*
 423          * Resource snapshot for time of sort start.
 424          */
 425 #ifdef TRACE_SORT
 426         PGRUsage        ru_start;
 427 #endif
 428 };
 429
 430 #define COMPARETUP(state,a,b)   ((*(state)->comparetup) (a, b, state))
 431 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
 432 #define WRITETUP(state,tape,stup)       ((*(state)->writetup) (state, tape, stup))
 433 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
 434 #define LACKMEM(state)          ((state)->availMem < 0 && !(state)->batchUsed)
 435 #define USEMEM(state,amt)       ((state)->availMem -= (amt))
 436 #define FREEMEM(state,amt)      ((state)->availMem += (amt))
 437
 438 /*
 439  * NOTES about on-tape representation of tuples:
 440  *
 441  * We require the first "unsigned int" of a stored tuple to be the total size
 442  * on-tape of the tuple, including itself (so it is never zero; an all-zero
 443  * unsigned int is used to delimit runs).  The remainder of the stored tuple
 444  * may or may not match the in-memory representation of the tuple ---
 445  * any conversion needed is the job of the writetup and readtup routines.
 446  *
 447  * If state->randomAccess is true, then the stored representation of the
 448  * tuple must be followed by another "unsigned int" that is a copy of the
 449  * length --- so the total tape space used is actually sizeof(unsigned int)
 450  * more than the stored length value.  This allows read-backwards.  When
 451  * randomAccess is not true, the write/read routines may omit the extra
 452  * length word.
 453  *
 454  * writetup is expected to write both length words as well as the tuple
 455  * data.  When readtup is called, the tape is positioned just after the
 456  * front length word; readtup must read the tuple data and advance past
 457  * the back length word (if present).
 458  *
 459  * The write/read routines can make use of the tuple description data
 460  * stored in the Tuplesortstate record, if needed.  They are also expected
 461  * to adjust state->availMem by the amount of memory space (not tape space!)
 462  * released or consumed.  There is no error return from either writetup
 463  * or readtup; they should ereport() on failure.
 464  *
 465  *
 466  * NOTES about memory consumption calculations:
 467  *
 468  * We count space allocated for tuples against the workMem limit, plus
 469  * the space used by the variable-size memtuples array.  Fixed-size space
 470  * is not counted; it's small enough to not be interesting.
 471  *
 472  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
 473  * rather than the originally-requested size.  This is important since
 474  * palloc can add substantial overhead.  It's not a complete answer since
 475  * we won't count any wasted space in palloc allocation blocks, but it's
 476  * a lot better than what we were doing before 7.3.  As of 9.6, a
 477  * separate memory context is used for caller passed tuples.  Resetting
 478  * it at certain key increments significantly ameliorates fragmentation.
 479  * Note that this places a responsibility on readtup and copytup routines
 480  * to use the right memory context for these tuples (and to not use the
 481  * reset context for anything whose lifetime needs to span multiple
 482  * external sort runs).
 483  */
 484
 485 /* When using this macro, beware of double evaluation of len */
 486 #define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
 487         do { \
 488                 if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
 489                         elog(ERROR, "unexpected end of data"); \
 490         } while(0)
 491
 492
 493 static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
 494 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
 495 static bool consider_abort_common(Tuplesortstate *state);
 496 static void inittapes(Tuplesortstate *state);
 497 static void selectnewtape(Tuplesortstate *state);
 498 static void mergeruns(Tuplesortstate *state);
 499 static void mergeonerun(Tuplesortstate *state);
 500 static void beginmerge(Tuplesortstate *state, bool finalMerge);
 501 static void batchmemtuples(Tuplesortstate *state);
 502 static void mergebatch(Tuplesortstate *state, int64 spacePerTape);
 503 static void mergebatchone(Tuplesortstate *state, int srcTape,
 504                                           SortTuple *stup, bool *should_free);
 505 static void mergebatchfreetape(Tuplesortstate *state, int srcTape,
 506                                                            SortTuple *rtup, bool *should_free);
 507 static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen);
 508 static void mergepreread(Tuplesortstate *state);
 509 static void mergeprereadone(Tuplesortstate *state, int srcTape);
 510 static void dumptuples(Tuplesortstate *state, bool alltuples);
 511 static void make_bounded_heap(Tuplesortstate *state);
 512 static void sort_bounded_heap(Tuplesortstate *state);
 513 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 514                                           int tupleindex, bool checkIndex);
 515 static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
 516 static void reversedirection(Tuplesortstate *state);
 517 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
 518 static void markrunend(Tuplesortstate *state, int tapenum);
 519 static void *readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen);
 520 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
 521                                 Tuplesortstate *state);
 522 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
 523 static void writetup_heap(Tuplesortstate *state, int tapenum,
 524                           SortTuple *stup);
 525 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
 526                          int tapenum, unsigned int len);
 527 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
 528                                    Tuplesortstate *state);
 529 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
 530 static void writetup_cluster(Tuplesortstate *state, int tapenum,
 531                                  SortTuple *stup);
 532 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
 533                                 int tapenum, unsigned int len);
 534 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 535                                            Tuplesortstate *state);
 536 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 537                                           Tuplesortstate *state);
 538 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
 539 static void writetup_index(Tuplesortstate *state, int tapenum,
 540                            SortTuple *stup);
 541 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
 542                           int tapenum, unsigned int len);
 543 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
 544                                  Tuplesortstate *state);
 545 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
 546 static void writetup_datum(Tuplesortstate *state, int tapenum,
 547                            SortTuple *stup);
 548 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
 549                           int tapenum, unsigned int len);
 550 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
 551
 552 /*
 553  * Special versions of qsort just for SortTuple objects.  qsort_tuple() sorts
 554  * any variant of SortTuples, using the appropriate comparetup function.
 555  * qsort_ssup() is specialized for the case where the comparetup function
 556  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
 557  * and Datum sorts.
 558  */
 559 #include "qsort_tuple.c"
 560
 561
 562 /*
 563  *              tuplesort_begin_xxx
 564  *
 565  * Initialize for a tuple sort operation.
 566  *
 567  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
 568  * zero or more times, then call tuplesort_performsort when all the tuples
 569  * have been supplied.  After performsort, retrieve the tuples in sorted
 570  * order by calling tuplesort_getXXX until it returns false/NULL.  (If random
 571  * access was requested, rescan, markpos, and restorepos can also be called.)
 572  * Call tuplesort_end to terminate the operation and release memory/disk space.
 573  *
 574  * Each variant of tuplesort_begin has a workMem parameter specifying the
 575  * maximum number of kilobytes of RAM to use before spilling data to disk.
 576  * (The normal value of this parameter is work_mem, but some callers use
 577  * other values.)  Each variant also has a randomAccess parameter specifying
 578  * whether the caller needs non-sequential access to the sort result.
 579  */
 580
 581 static Tuplesortstate *
 582 tuplesort_begin_common(int workMem, bool randomAccess)
 583 {
 584         Tuplesortstate *state;
 585         MemoryContext sortcontext;
 586         MemoryContext tuplecontext;
 587         MemoryContext oldcontext;
 588
 589         /*
 590          * Create a working memory context for this sort operation. All data
 591          * needed by the sort will live inside this context.
 592          */
 593         sortcontext = AllocSetContextCreate(CurrentMemoryContext,
 594                                                                                 "TupleSort main",
 595                                                                                 ALLOCSET_DEFAULT_MINSIZE,
 596                                                                                 ALLOCSET_DEFAULT_INITSIZE,
 597                                                                                 ALLOCSET_DEFAULT_MAXSIZE);
 598
 599         /*
 600          * Caller tuple (e.g. IndexTuple) memory context.
 601          *
 602          * A dedicated child content used exclusively for caller passed tuples
 603          * eases memory management.  Resetting at key points reduces fragmentation.
 604          * Note that the memtuples array of SortTuples is allocated in the parent
 605          * context, not this context, because there is no need to free memtuples
 606          * early.
 607          */
 608         tuplecontext = AllocSetContextCreate(sortcontext,
 609                                                                                  "Caller tuples",
 610                                                                                  ALLOCSET_DEFAULT_MINSIZE,
 611                                                                                  ALLOCSET_DEFAULT_INITSIZE,
 612                                                                                  ALLOCSET_DEFAULT_MAXSIZE);
 613
 614         /*
 615          * Make the Tuplesortstate within the per-sort context.  This way, we
 616          * don't need a separate pfree() operation for it at shutdown.
 617          */
 618         oldcontext = MemoryContextSwitchTo(sortcontext);
 619
 620         state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
 621
 622 #ifdef TRACE_SORT
 623         if (trace_sort)
 624                 pg_rusage_init(&state->ru_start);
 625 #endif
 626
 627         state->status = TSS_INITIAL;
 628         state->randomAccess = randomAccess;
 629         state->bounded = false;
 630         state->tuples = true;
 631         state->boundUsed = false;
 632         state->allowedMem = workMem * (int64) 1024;
 633         state->availMem = state->allowedMem;
 634         state->sortcontext = sortcontext;
 635         state->tuplecontext = tuplecontext;
 636         state->tapeset = NULL;
 637
 638         state->memtupcount = 0;
 639
 640         /*
 641          * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
 642          * see comments in grow_memtuples().
 643          */
 644         state->memtupsize = Max(1024,
 645                                                 ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
 646
 647         state->growmemtuples = true;
 648         state->batchUsed = false;
 649         state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
 650
 651         USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 652
 653         /* workMem must be large enough for the minimal memtuples array */
 654         if (LACKMEM(state))
 655                 elog(ERROR, "insufficient memory allowed for sort");
 656
 657         state->currentRun = 0;
 658
 659         /*
 660          * maxTapes, tapeRange, and Algorithm D variables will be initialized by
 661          * inittapes(), if needed
 662          */
 663
 664         state->result_tape = -1;        /* flag that result tape has not been formed */
 665
 666         MemoryContextSwitchTo(oldcontext);
 667
 668         return state;
 669 }
 670
 671 Tuplesortstate *
 672 tuplesort_begin_heap(TupleDesc tupDesc,
 673                                          int nkeys, AttrNumber *attNums,
 674                                          Oid *sortOperators, Oid *sortCollations,
 675                                          bool *nullsFirstFlags,
 676                                          int workMem, bool randomAccess)
 677 {
 678         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 679         MemoryContext oldcontext;
 680         int                     i;
 681
 682         oldcontext = MemoryContextSwitchTo(state->sortcontext);
 683
 684         AssertArg(nkeys > 0);
 685
 686 #ifdef TRACE_SORT
 687         if (trace_sort)
 688                 elog(LOG,
 689                          "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
 690                          nkeys, workMem, randomAccess ? 't' : 'f');
 691 #endif
 692
 693         state->nKeys = nkeys;
 694
 695         TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
 696                                                                 false,  /* no unique check */
 697                                                                 nkeys,
 698                                                                 workMem,
 699                                                                 randomAccess);
 700
 701         state->comparetup = comparetup_heap;
 702         state->copytup = copytup_heap;
 703         state->writetup = writetup_heap;
 704         state->readtup = readtup_heap;
 705
 706         state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
 707         state->abbrevNext = 10;
 708
 709         /* Prepare SortSupport data for each column */
 710         state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
 711
 712         for (i = 0; i < nkeys; i++)
 713         {
 714                 SortSupport sortKey = state->sortKeys + i;
 715
 716                 AssertArg(attNums[i] != 0);
 717                 AssertArg(sortOperators[i] != 0);
 718
 719                 sortKey->ssup_cxt = CurrentMemoryContext;
 720                 sortKey->ssup_collation = sortCollations[i];
 721                 sortKey->ssup_nulls_first = nullsFirstFlags[i];
 722                 sortKey->ssup_attno = attNums[i];
 723                 /* Convey if abbreviation optimization is applicable in principle */
 724                 sortKey->abbreviate = (i == 0);
 725
 726                 PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
 727         }
 728
 729         /*
 730          * The "onlyKey" optimization cannot be used with abbreviated keys, since
 731          * tie-breaker comparisons may be required.  Typically, the optimization
 732          * is only of value to pass-by-value types anyway, whereas abbreviated
 733          * keys are typically only of value to pass-by-reference types.
 734          */
 735         if (nkeys == 1 && !state->sortKeys->abbrev_converter)
 736                 state->onlyKey = state->sortKeys;
 737
 738         MemoryContextSwitchTo(oldcontext);
 739
 740         return state;
 741 }
 742
 743 Tuplesortstate *
 744 tuplesort_begin_cluster(TupleDesc tupDesc,
 745                                                 Relation indexRel,
 746                                                 int workMem, bool randomAccess)
 747 {
 748         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 749         ScanKey         indexScanKey;
 750         MemoryContext oldcontext;
 751         int                     i;
 752
 753         Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
 754
 755         oldcontext = MemoryContextSwitchTo(state->sortcontext);
 756
 757 #ifdef TRACE_SORT
 758         if (trace_sort)
 759                 elog(LOG,
 760                          "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
 761                          RelationGetNumberOfAttributes(indexRel),
 762                          workMem, randomAccess ? 't' : 'f');
 763 #endif
 764
 765         state->nKeys = RelationGetNumberOfAttributes(indexRel);
 766
 767         TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
 768                                                                 false,  /* no unique check */
 769                                                                 state->nKeys,
 770                                                                 workMem,
 771                                                                 randomAccess);
 772
 773         state->comparetup = comparetup_cluster;
 774         state->copytup = copytup_cluster;
 775         state->writetup = writetup_cluster;
 776         state->readtup = readtup_cluster;
 777         state->abbrevNext = 10;
 778
 779         state->indexInfo = BuildIndexInfo(indexRel);
 780
 781         state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
 782
 783         indexScanKey = _bt_mkscankey_nodata(indexRel);
 784
 785         if (state->indexInfo->ii_Expressions != NULL)
 786         {
 787                 TupleTableSlot *slot;
 788                 ExprContext *econtext;
 789
 790                 /*
 791                  * We will need to use FormIndexDatum to evaluate the index
 792                  * expressions.  To do that, we need an EState, as well as a
 793                  * TupleTableSlot to put the table tuples into.  The econtext's
 794                  * scantuple has to point to that slot, too.
 795                  */
 796                 state->estate = CreateExecutorState();
 797                 slot = MakeSingleTupleTableSlot(tupDesc);
 798                 econtext = GetPerTupleExprContext(state->estate);
 799                 econtext->ecxt_scantuple = slot;
 800         }
 801
 802         /* Prepare SortSupport data for each column */
 803         state->sortKeys = (SortSupport) palloc0(state->nKeys *
 804                                                                                         sizeof(SortSupportData));
 805
 806         for (i = 0; i < state->nKeys; i++)
 807         {
 808                 SortSupport sortKey = state->sortKeys + i;
 809                 ScanKey         scanKey = indexScanKey + i;
 810                 int16           strategy;
 811
 812                 sortKey->ssup_cxt = CurrentMemoryContext;
 813                 sortKey->ssup_collation = scanKey->sk_collation;
 814                 sortKey->ssup_nulls_first =
 815                         (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
 816                 sortKey->ssup_attno = scanKey->sk_attno;
 817                 /* Convey if abbreviation optimization is applicable in principle */
 818                 sortKey->abbreviate = (i == 0);
 819
 820                 AssertState(sortKey->ssup_attno != 0);
 821
 822                 strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
 823                         BTGreaterStrategyNumber : BTLessStrategyNumber;
 824
 825                 PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
 826         }
 827
 828         _bt_freeskey(indexScanKey);
 829
 830         MemoryContextSwitchTo(oldcontext);
 831
 832         return state;
 833 }
 834
 835 Tuplesortstate *
 836 tuplesort_begin_index_btree(Relation heapRel,
 837                                                         Relation indexRel,
 838                                                         bool enforceUnique,
 839                                                         int workMem, bool randomAccess)
 840 {
 841         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 842         ScanKey         indexScanKey;
 843         MemoryContext oldcontext;
 844         int                     i;
 845
 846         oldcontext = MemoryContextSwitchTo(state->sortcontext);
 847
 848 #ifdef TRACE_SORT
 849         if (trace_sort)
 850                 elog(LOG,
 851                          "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
 852                          enforceUnique ? 't' : 'f',
 853                          workMem, randomAccess ? 't' : 'f');
 854 #endif
 855
 856         state->nKeys = RelationGetNumberOfAttributes(indexRel);
 857
 858         TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
 859                                                                 enforceUnique,
 860                                                                 state->nKeys,
 861                                                                 workMem,
 862                                                                 randomAccess);
 863
 864         state->comparetup = comparetup_index_btree;
 865         state->copytup = copytup_index;
 866         state->writetup = writetup_index;
 867         state->readtup = readtup_index;
 868         state->abbrevNext = 10;
 869
 870         state->heapRel = heapRel;
 871         state->indexRel = indexRel;
 872         state->enforceUnique = enforceUnique;
 873
 874         indexScanKey = _bt_mkscankey_nodata(indexRel);
 875         state->nKeys = RelationGetNumberOfAttributes(indexRel);
 876
 877         /* Prepare SortSupport data for each column */
 878         state->sortKeys = (SortSupport) palloc0(state->nKeys *
 879                                                                                         sizeof(SortSupportData));
 880
 881         for (i = 0; i < state->nKeys; i++)
 882         {
 883                 SortSupport sortKey = state->sortKeys + i;
 884                 ScanKey         scanKey = indexScanKey + i;
 885                 int16           strategy;
 886
 887                 sortKey->ssup_cxt = CurrentMemoryContext;
 888                 sortKey->ssup_collation = scanKey->sk_collation;
 889                 sortKey->ssup_nulls_first =
 890                         (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
 891                 sortKey->ssup_attno = scanKey->sk_attno;
 892                 /* Convey if abbreviation optimization is applicable in principle */
 893                 sortKey->abbreviate = (i == 0);
 894
 895                 AssertState(sortKey->ssup_attno != 0);
 896
 897                 strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
 898                         BTGreaterStrategyNumber : BTLessStrategyNumber;
 899
 900                 PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
 901         }
 902
 903         _bt_freeskey(indexScanKey);
 904
 905         MemoryContextSwitchTo(oldcontext);
 906
 907         return state;
 908 }
 909
 910 Tuplesortstate *
 911 tuplesort_begin_index_hash(Relation heapRel,
 912                                                    Relation indexRel,
 913                                                    uint32 hash_mask,
 914                                                    int workMem, bool randomAccess)
 915 {
 916         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 917         MemoryContext oldcontext;
 918
 919         oldcontext = MemoryContextSwitchTo(state->sortcontext);
 920
 921 #ifdef TRACE_SORT
 922         if (trace_sort)
 923                 elog(LOG,
 924                 "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c",
 925                          hash_mask,
 926                          workMem, randomAccess ? 't' : 'f');
 927 #endif
 928
 929         state->nKeys = 1;                       /* Only one sort column, the hash code */
 930
 931         state->comparetup = comparetup_index_hash;
 932         state->copytup = copytup_index;
 933         state->writetup = writetup_index;
 934         state->readtup = readtup_index;
 935
 936         state->heapRel = heapRel;
 937         state->indexRel = indexRel;
 938
 939         state->hash_mask = hash_mask;
 940
 941         MemoryContextSwitchTo(oldcontext);
 942
 943         return state;
 944 }
 945
 946 Tuplesortstate *
 947 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
 948                                           bool nullsFirstFlag,
 949                                           int workMem, bool randomAccess)
 950 {
 951         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 952         MemoryContext oldcontext;
 953         int16           typlen;
 954         bool            typbyval;
 955
 956         oldcontext = MemoryContextSwitchTo(state->sortcontext);
 957
 958 #ifdef TRACE_SORT
 959         if (trace_sort)
 960                 elog(LOG,
 961                          "begin datum sort: workMem = %d, randomAccess = %c",
 962                          workMem, randomAccess ? 't' : 'f');
 963 #endif
 964
 965         state->nKeys = 1;                       /* always a one-column sort */
 966
 967         TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
 968                                                                 false,  /* no unique check */
 969                                                                 1,
 970                                                                 workMem,
 971                                                                 randomAccess);
 972
 973         state->comparetup = comparetup_datum;
 974         state->copytup = copytup_datum;
 975         state->writetup = writetup_datum;
 976         state->readtup = readtup_datum;
 977         state->abbrevNext = 10;
 978
 979         state->datumType = datumType;
 980
 981         /* lookup necessary attributes of the datum type */
 982         get_typlenbyval(datumType, &typlen, &typbyval);
 983         state->datumTypeLen = typlen;
 984         state->tuples = !typbyval;
 985
 986         /* Prepare SortSupport data */
 987         state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
 988
 989         state->sortKeys->ssup_cxt = CurrentMemoryContext;
 990         state->sortKeys->ssup_collation = sortCollation;
 991         state->sortKeys->ssup_nulls_first = nullsFirstFlag;
 992
 993         /*
 994          * Abbreviation is possible here only for by-reference types.  In theory,
 995          * a pass-by-value datatype could have an abbreviated form that is cheaper
 996          * to compare.  In a tuple sort, we could support that, because we can
 997          * always extract the original datum from the tuple is needed.  Here, we
 998          * can't, because a datum sort only stores a single copy of the datum;
 999          * the "tuple" field of each sortTuple is NULL.
1000          */
1001         state->sortKeys->abbreviate = !typbyval;
1002
1003         PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1004
1005         /*
1006          * The "onlyKey" optimization cannot be used with abbreviated keys, since
1007          * tie-breaker comparisons may be required.  Typically, the optimization
1008          * is only of value to pass-by-value types anyway, whereas abbreviated
1009          * keys are typically only of value to pass-by-reference types.
1010          */
1011         if (!state->sortKeys->abbrev_converter)
1012                 state->onlyKey = state->sortKeys;
1013
1014         MemoryContextSwitchTo(oldcontext);
1015
1016         return state;
1017 }
1018
1019 /*
1020  * tuplesort_set_bound
1021  *
1022  *      Advise tuplesort that at most the first N result tuples are required.
1023  *
1024  * Must be called before inserting any tuples.  (Actually, we could allow it
1025  * as long as the sort hasn't spilled to disk, but there seems no need for
1026  * delayed calls at the moment.)
1027  *
1028  * This is a hint only. The tuplesort may still return more tuples than
1029  * requested.
1030  */
1031 void
1032 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1033 {
1034         /* Assert we're called before loading any tuples */
1035         Assert(state->status == TSS_INITIAL);
1036         Assert(state->memtupcount == 0);
1037         Assert(!state->bounded);
1038
1039 #ifdef DEBUG_BOUNDED_SORT
1040         /* Honor GUC setting that disables the feature (for easy testing) */
1041         if (!optimize_bounded_sort)
1042                 return;
1043 #endif
1044
1045         /* We want to be able to compute bound * 2, so limit the setting */
1046         if (bound > (int64) (INT_MAX / 2))
1047                 return;
1048
1049         state->bounded = true;
1050         state->bound = (int) bound;
1051
1052         /*
1053          * Bounded sorts are not an effective target for abbreviated key
1054          * optimization.  Disable by setting state to be consistent with no
1055          * abbreviation support.
1056          */
1057         state->sortKeys->abbrev_converter = NULL;
1058         if (state->sortKeys->abbrev_full_comparator)
1059                 state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1060
1061         /* Not strictly necessary, but be tidy */
1062         state->sortKeys->abbrev_abort = NULL;
1063         state->sortKeys->abbrev_full_comparator = NULL;
1064 }
1065
1066 /*
1067  * tuplesort_end
1068  *
1069  *      Release resources and clean up.
1070  *
1071  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1072  * pointing to garbage.  Be careful not to attempt to use or free such
1073  * pointers afterwards!
1074  */
1075 void
1076 tuplesort_end(Tuplesortstate *state)
1077 {
1078         /* context swap probably not needed, but let's be safe */
1079         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1080
1081 #ifdef TRACE_SORT
1082         long            spaceUsed;
1083
1084         if (state->tapeset)
1085                 spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1086         else
1087                 spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1088 #endif
1089
1090         /*
1091          * Delete temporary "tape" files, if any.
1092          *
1093          * Note: want to include this in reported total cost of sort, hence need
1094          * for two #ifdef TRACE_SORT sections.
1095          */
1096         if (state->tapeset)
1097                 LogicalTapeSetClose(state->tapeset);
1098
1099 #ifdef TRACE_SORT
1100         if (trace_sort)
1101         {
1102                 if (state->tapeset)
1103                         elog(LOG, "external sort ended, %ld disk blocks used: %s",
1104                                  spaceUsed, pg_rusage_show(&state->ru_start));
1105                 else
1106                         elog(LOG, "internal sort ended, %ld KB used: %s",
1107                                  spaceUsed, pg_rusage_show(&state->ru_start));
1108         }
1109
1110         TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1111 #else
1112
1113         /*
1114          * If you disabled TRACE_SORT, you can still probe sort__done, but you
1115          * ain't getting space-used stats.
1116          */
1117         TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1118 #endif
1119
1120         /* Free any execution state created for CLUSTER case */
1121         if (state->estate != NULL)
1122         {
1123                 ExprContext *econtext = GetPerTupleExprContext(state->estate);
1124
1125                 ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1126                 FreeExecutorState(state->estate);
1127         }
1128
1129         MemoryContextSwitchTo(oldcontext);
1130
1131         /*
1132          * Free the per-sort memory context, thereby releasing all working memory,
1133          * including the Tuplesortstate struct itself.
1134          */
1135         MemoryContextDelete(state->sortcontext);
1136 }
1137
1138 /*
1139  * Grow the memtuples[] array, if possible within our memory constraint.  We
1140  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1141  * limit.  Return TRUE if we were able to enlarge the array, FALSE if not.
1142  *
1143  * Normally, at each increment we double the size of the array.  When doing
1144  * that would exceed a limit, we attempt one last, smaller increase (and then
1145  * clear the growmemtuples flag so we don't try any more).  That allows us to
1146  * use memory as fully as permitted; sticking to the pure doubling rule could
1147  * result in almost half going unused.  Because availMem moves around with
1148  * tuple addition/removal, we need some rule to prevent making repeated small
1149  * increases in memtupsize, which would just be useless thrashing.  The
1150  * growmemtuples flag accomplishes that and also prevents useless
1151  * recalculations in this function.
1152  */
1153 static bool
1154 grow_memtuples(Tuplesortstate *state)
1155 {
1156         int                     newmemtupsize;
1157         int                     memtupsize = state->memtupsize;
1158         int64           memNowUsed = state->allowedMem - state->availMem;
1159
1160         /* Forget it if we've already maxed out memtuples, per comment above */
1161         if (!state->growmemtuples)
1162                 return false;
1163
1164         /* Select new value of memtupsize */
1165         if (memNowUsed <= state->availMem)
1166         {
1167                 /*
1168                  * We've used no more than half of allowedMem; double our usage,
1169                  * clamping at INT_MAX tuples.
1170                  */
1171                 if (memtupsize < INT_MAX / 2)
1172                         newmemtupsize = memtupsize * 2;
1173                 else
1174                 {
1175                         newmemtupsize = INT_MAX;
1176                         state->growmemtuples = false;
1177                 }
1178         }
1179         else
1180         {
1181                 /*
1182                  * This will be the last increment of memtupsize.  Abandon doubling
1183                  * strategy and instead increase as much as we safely can.
1184                  *
1185                  * To stay within allowedMem, we can't increase memtupsize by more
1186                  * than availMem / sizeof(SortTuple) elements.  In practice, we want
1187                  * to increase it by considerably less, because we need to leave some
1188                  * space for the tuples to which the new array slots will refer.  We
1189                  * assume the new tuples will be about the same size as the tuples
1190                  * we've already seen, and thus we can extrapolate from the space
1191                  * consumption so far to estimate an appropriate new size for the
1192                  * memtuples array.  The optimal value might be higher or lower than
1193                  * this estimate, but it's hard to know that in advance.  We again
1194                  * clamp at INT_MAX tuples.
1195                  *
1196                  * This calculation is safe against enlarging the array so much that
1197                  * LACKMEM becomes true, because the memory currently used includes
1198                  * the present array; thus, there would be enough allowedMem for the
1199                  * new array elements even if no other memory were currently used.
1200                  *
1201                  * We do the arithmetic in float8, because otherwise the product of
1202                  * memtupsize and allowedMem could overflow.  Any inaccuracy in the
1203                  * result should be insignificant; but even if we computed a
1204                  * completely insane result, the checks below will prevent anything
1205                  * really bad from happening.
1206                  */
1207                 double          grow_ratio;
1208
1209                 grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1210                 if (memtupsize * grow_ratio < INT_MAX)
1211                         newmemtupsize = (int) (memtupsize * grow_ratio);
1212                 else
1213                         newmemtupsize = INT_MAX;
1214
1215                 /* We won't make any further enlargement attempts */
1216                 state->growmemtuples = false;
1217         }
1218
1219         /* Must enlarge array by at least one element, else report failure */
1220         if (newmemtupsize <= memtupsize)
1221                 goto noalloc;
1222
1223         /*
1224          * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize.  Clamp
1225          * to ensure our request won't be rejected.  Note that we can easily
1226          * exhaust address space before facing this outcome.  (This is presently
1227          * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1228          * don't rely on that at this distance.)
1229          */
1230         if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1231         {
1232                 newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1233                 state->growmemtuples = false;   /* can't grow any more */
1234         }
1235
1236         /*
1237          * We need to be sure that we do not cause LACKMEM to become true, else
1238          * the space management algorithm will go nuts.  The code above should
1239          * never generate a dangerous request, but to be safe, check explicitly
1240          * that the array growth fits within availMem.  (We could still cause
1241          * LACKMEM if the memory chunk overhead associated with the memtuples
1242          * array were to increase.  That shouldn't happen because we chose the
1243          * initial array size large enough to ensure that palloc will be treating
1244          * both old and new arrays as separate chunks.  But we'll check LACKMEM
1245          * explicitly below just in case.)
1246          */
1247         if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1248                 goto noalloc;
1249
1250         /* OK, do it */
1251         FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1252         state->memtupsize = newmemtupsize;
1253         state->memtuples = (SortTuple *)
1254                 repalloc_huge(state->memtuples,
1255                                           state->memtupsize * sizeof(SortTuple));
1256         USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1257         if (LACKMEM(state))
1258                 elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1259         return true;
1260
1261 noalloc:
1262         /* If for any reason we didn't realloc, shut off future attempts */
1263         state->growmemtuples = false;
1264         return false;
1265 }
1266
1267 /*
1268  * Accept one tuple while collecting input data for sort.
1269  *
1270  * Note that the input data is always copied; the caller need not save it.
1271  */
1272 void
1273 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1274 {
1275         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1276         SortTuple       stup;
1277
1278         /*
1279          * Copy the given tuple into memory we control, and decrease availMem.
1280          * Then call the common code.
1281          */
1282         COPYTUP(state, &stup, (void *) slot);
1283
1284         puttuple_common(state, &stup);
1285
1286         MemoryContextSwitchTo(oldcontext);
1287 }
1288
1289 /*
1290  * Accept one tuple while collecting input data for sort.
1291  *
1292  * Note that the input data is always copied; the caller need not save it.
1293  */
1294 void
1295 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1296 {
1297         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1298         SortTuple       stup;
1299
1300         /*
1301          * Copy the given tuple into memory we control, and decrease availMem.
1302          * Then call the common code.
1303          */
1304         COPYTUP(state, &stup, (void *) tup);
1305
1306         puttuple_common(state, &stup);
1307
1308         MemoryContextSwitchTo(oldcontext);
1309 }
1310
1311 /*
1312  * Collect one index tuple while collecting input data for sort, building
1313  * it from caller-supplied values.
1314  */
1315 void
1316 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1317                                                           ItemPointer self, Datum *values,
1318                                                           bool *isnull)
1319 {
1320         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1321         SortTuple       stup;
1322         Datum           original;
1323         IndexTuple      tuple;
1324
1325         stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1326         tuple = ((IndexTuple) stup.tuple);
1327         tuple->t_tid = *self;
1328         USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1329         /* set up first-column key value */
1330         original = index_getattr(tuple,
1331                                                          1,
1332                                                          RelationGetDescr(state->indexRel),
1333                                                          &stup.isnull1);
1334
1335         MemoryContextSwitchTo(state->sortcontext);
1336
1337         if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1338         {
1339                 /*
1340                  * Store ordinary Datum representation, or NULL value.  If there is a
1341                  * converter it won't expect NULL values, and cost model is not
1342                  * required to account for NULL, so in that case we avoid calling
1343                  * converter and just set datum1 to zeroed representation (to be
1344                  * consistent, and to support cheap inequality tests for NULL
1345                  * abbreviated keys).
1346                  */
1347                 stup.datum1 = original;
1348         }
1349         else if (!consider_abort_common(state))
1350         {
1351                 /* Store abbreviated key representation */
1352                 stup.datum1 = state->sortKeys->abbrev_converter(original,
1353                                                                                                                 state->sortKeys);
1354         }
1355         else
1356         {
1357                 /* Abort abbreviation */
1358                 int                     i;
1359
1360                 stup.datum1 = original;
1361
1362                 /*
1363                  * Set state to be consistent with never trying abbreviation.
1364                  *
1365                  * Alter datum1 representation in already-copied tuples, so as to
1366                  * ensure a consistent representation (current tuple was just
1367                  * handled).  It does not matter if some dumped tuples are already
1368                  * sorted on tape, since serialized tuples lack abbreviated keys
1369                  * (TSS_BUILDRUNS state prevents control reaching here in any
1370                  * case).
1371                  */
1372                 for (i = 0; i < state->memtupcount; i++)
1373                 {
1374                         SortTuple  *mtup = &state->memtuples[i];
1375
1376                         tuple = mtup->tuple;
1377                         mtup->datum1 = index_getattr(tuple,
1378                                                                                  1,
1379                                                                                  RelationGetDescr(state->indexRel),
1380                                                                                  &stup.isnull1);
1381                 }
1382         }
1383
1384         puttuple_common(state, &stup);
1385
1386         MemoryContextSwitchTo(oldcontext);
1387 }
1388
1389 /*
1390  * Accept one Datum while collecting input data for sort.
1391  *
1392  * If the Datum is pass-by-ref type, the value will be copied.
1393  */
1394 void
1395 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1396 {
1397         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1398         SortTuple       stup;
1399
1400         /*
1401          * Pass-by-value types or null values are just stored directly in
1402          * stup.datum1 (and stup.tuple is not used and set to NULL).
1403          *
1404          * Non-null pass-by-reference values need to be copied into memory we
1405          * control, and possibly abbreviated. The copied value is pointed to by
1406          * stup.tuple and is treated as the canonical copy (e.g. to return via
1407          * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1408          * abbreviated value if abbreviation is happening, otherwise it's
1409          * identical to stup.tuple.
1410          */
1411
1412         if (isNull || !state->tuples)
1413         {
1414                 /*
1415                  * Set datum1 to zeroed representation for NULLs (to be consistent, and
1416                  * to support cheap inequality tests for NULL abbreviated keys).
1417                  */
1418                 stup.datum1 = !isNull ? val : (Datum) 0;
1419                 stup.isnull1 = isNull;
1420                 stup.tuple = NULL;              /* no separate storage */
1421                 MemoryContextSwitchTo(state->sortcontext);
1422         }
1423         else
1424         {
1425                 Datum           original = datumCopy(val, false, state->datumTypeLen);
1426
1427                 stup.isnull1 = false;
1428                 stup.tuple = DatumGetPointer(original);
1429                 USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1430                 MemoryContextSwitchTo(state->sortcontext);
1431
1432                 if (!state->sortKeys->abbrev_converter)
1433                 {
1434                         stup.datum1 = original;
1435                 }
1436                 else if (!consider_abort_common(state))
1437                 {
1438                         /* Store abbreviated key representation */
1439                         stup.datum1 = state->sortKeys->abbrev_converter(original,
1440                                                                                                                         state->sortKeys);
1441                 }
1442                 else
1443                 {
1444                         /* Abort abbreviation */
1445                         int                     i;
1446
1447                         stup.datum1 = original;
1448
1449                         /*
1450                          * Set state to be consistent with never trying abbreviation.
1451                          *
1452                          * Alter datum1 representation in already-copied tuples, so as to
1453                          * ensure a consistent representation (current tuple was just
1454                          * handled).  It does not matter if some dumped tuples are
1455                          * already sorted on tape, since serialized tuples lack
1456                          * abbreviated keys (TSS_BUILDRUNS state prevents control
1457                          * reaching here in any case).
1458                          */
1459                         for (i = 0; i < state->memtupcount; i++)
1460                         {
1461                                 SortTuple  *mtup = &state->memtuples[i];
1462
1463                                 mtup->datum1 = PointerGetDatum(mtup->tuple);
1464                         }
1465                 }
1466         }
1467
1468         puttuple_common(state, &stup);
1469
1470         MemoryContextSwitchTo(oldcontext);
1471 }
1472
1473 /*
1474  * Shared code for tuple and datum cases.
1475  */
1476 static void
1477 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1478 {
1479         switch (state->status)
1480         {
1481                 case TSS_INITIAL:
1482
1483                         /*
1484                          * Save the tuple into the unsorted array.  First, grow the array
1485                          * as needed.  Note that we try to grow the array when there is
1486                          * still one free slot remaining --- if we fail, there'll still be
1487                          * room to store the incoming tuple, and then we'll switch to
1488                          * tape-based operation.
1489                          */
1490                         if (state->memtupcount >= state->memtupsize - 1)
1491                         {
1492                                 (void) grow_memtuples(state);
1493                                 Assert(state->memtupcount < state->memtupsize);
1494                         }
1495                         state->memtuples[state->memtupcount++] = *tuple;
1496
1497                         /*
1498                          * Check if it's time to switch over to a bounded heapsort. We do
1499                          * so if the input tuple count exceeds twice the desired tuple
1500                          * count (this is a heuristic for where heapsort becomes cheaper
1501                          * than a quicksort), or if we've just filled workMem and have
1502                          * enough tuples to meet the bound.
1503                          *
1504                          * Note that once we enter TSS_BOUNDED state we will always try to
1505                          * complete the sort that way.  In the worst case, if later input
1506                          * tuples are larger than earlier ones, this might cause us to
1507                          * exceed workMem significantly.
1508                          */
1509                         if (state->bounded &&
1510                                 (state->memtupcount > state->bound * 2 ||
1511                                  (state->memtupcount > state->bound && LACKMEM(state))))
1512                         {
1513 #ifdef TRACE_SORT
1514                                 if (trace_sort)
1515                                         elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1516                                                  state->memtupcount,
1517                                                  pg_rusage_show(&state->ru_start));
1518 #endif
1519                                 make_bounded_heap(state);
1520                                 return;
1521                         }
1522
1523                         /*
1524                          * Done if we still fit in available memory and have array slots.
1525                          */
1526                         if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1527                                 return;
1528
1529                         /*
1530                          * Nope; time to switch to tape-based operation.
1531                          */
1532                         inittapes(state);
1533
1534                         /*
1535                          * Dump tuples until we are back under the limit.
1536                          */
1537                         dumptuples(state, false);
1538                         break;
1539
1540                 case TSS_BOUNDED:
1541
1542                         /*
1543                          * We don't want to grow the array here, so check whether the new
1544                          * tuple can be discarded before putting it in.  This should be a
1545                          * good speed optimization, too, since when there are many more
1546                          * input tuples than the bound, most input tuples can be discarded
1547                          * with just this one comparison.  Note that because we currently
1548                          * have the sort direction reversed, we must check for <= not >=.
1549                          */
1550                         if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1551                         {
1552                                 /* new tuple <= top of the heap, so we can discard it */
1553                                 free_sort_tuple(state, tuple);
1554                                 CHECK_FOR_INTERRUPTS();
1555                         }
1556                         else
1557                         {
1558                                 /* discard top of heap, sift up, insert new tuple */
1559                                 free_sort_tuple(state, &state->memtuples[0]);
1560                                 tuplesort_heap_siftup(state, false);
1561                                 tuplesort_heap_insert(state, tuple, 0, false);
1562                         }
1563                         break;
1564
1565                 case TSS_BUILDRUNS:
1566
1567                         /*
1568                          * Insert the tuple into the heap, with run number currentRun if
1569                          * it can go into the current run, else run number currentRun+1.
1570                          * The tuple can go into the current run if it is >= the first
1571                          * not-yet-output tuple.  (Actually, it could go into the current
1572                          * run if it is >= the most recently output tuple ... but that
1573                          * would require keeping around the tuple we last output, and it's
1574                          * simplest to let writetup free each tuple as soon as it's
1575                          * written.)
1576                          *
1577                          * Note there will always be at least one tuple in the heap at
1578                          * this point; see dumptuples.
1579                          */
1580                         Assert(state->memtupcount > 0);
1581                         if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
1582                                 tuplesort_heap_insert(state, tuple, state->currentRun, true);
1583                         else
1584                                 tuplesort_heap_insert(state, tuple, state->currentRun + 1, true);
1585
1586                         /*
1587                          * If we are over the memory limit, dump tuples till we're under.
1588                          */
1589                         dumptuples(state, false);
1590                         break;
1591
1592                 default:
1593                         elog(ERROR, "invalid tuplesort state");
1594                         break;
1595         }
1596 }
1597
1598 static bool
1599 consider_abort_common(Tuplesortstate *state)
1600 {
1601         Assert(state->sortKeys[0].abbrev_converter != NULL);
1602         Assert(state->sortKeys[0].abbrev_abort != NULL);
1603         Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
1604
1605         /*
1606          * Check effectiveness of abbreviation optimization.  Consider aborting
1607          * when still within memory limit.
1608          */
1609         if (state->status == TSS_INITIAL &&
1610                 state->memtupcount >= state->abbrevNext)
1611         {
1612                 state->abbrevNext *= 2;
1613
1614                 /*
1615                  * Check opclass-supplied abbreviation abort routine.  It may indicate
1616                  * that abbreviation should not proceed.
1617                  */
1618                 if (!state->sortKeys->abbrev_abort(state->memtupcount,
1619                                                                                    state->sortKeys))
1620                         return false;
1621
1622                 /*
1623                  * Finally, restore authoritative comparator, and indicate that
1624                  * abbreviation is not in play by setting abbrev_converter to NULL
1625                  */
1626                 state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
1627                 state->sortKeys[0].abbrev_converter = NULL;
1628                 /* Not strictly necessary, but be tidy */
1629                 state->sortKeys[0].abbrev_abort = NULL;
1630                 state->sortKeys[0].abbrev_full_comparator = NULL;
1631
1632                 /* Give up - expect original pass-by-value representation */
1633                 return true;
1634         }
1635
1636         return false;
1637 }
1638
1639 /*
1640  * All tuples have been provided; finish the sort.
1641  */
1642 void
1643 tuplesort_performsort(Tuplesortstate *state)
1644 {
1645         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1646
1647 #ifdef TRACE_SORT
1648         if (trace_sort)
1649                 elog(LOG, "performsort starting: %s",
1650                          pg_rusage_show(&state->ru_start));
1651 #endif
1652
1653         switch (state->status)
1654         {
1655                 case TSS_INITIAL:
1656
1657                         /*
1658                          * We were able to accumulate all the tuples within the allowed
1659                          * amount of memory.  Just qsort 'em and we're done.
1660                          */
1661                         if (state->memtupcount > 1)
1662                         {
1663                                 /* Can we use the single-key sort function? */
1664                                 if (state->onlyKey != NULL)
1665                                         qsort_ssup(state->memtuples, state->memtupcount,
1666                                                            state->onlyKey);
1667                                 else
1668                                         qsort_tuple(state->memtuples,
1669                                                                 state->memtupcount,
1670                                                                 state->comparetup,
1671                                                                 state);
1672                         }
1673                         state->current = 0;
1674                         state->eof_reached = false;
1675                         state->markpos_offset = 0;
1676                         state->markpos_eof = false;
1677                         state->status = TSS_SORTEDINMEM;
1678                         break;
1679
1680                 case TSS_BOUNDED:
1681
1682                         /*
1683                          * We were able to accumulate all the tuples required for output
1684                          * in memory, using a heap to eliminate excess tuples.  Now we
1685                          * have to transform the heap to a properly-sorted array.
1686                          */
1687                         sort_bounded_heap(state);
1688                         state->current = 0;
1689                         state->eof_reached = false;
1690                         state->markpos_offset = 0;
1691                         state->markpos_eof = false;
1692                         state->status = TSS_SORTEDINMEM;
1693                         break;
1694
1695                 case TSS_BUILDRUNS:
1696
1697                         /*
1698                          * Finish tape-based sort.  First, flush all tuples remaining in
1699                          * memory out to tape; then merge until we have a single remaining
1700                          * run (or, if !randomAccess, one run per tape). Note that
1701                          * mergeruns sets the correct state->status.
1702                          */
1703                         dumptuples(state, true);
1704                         mergeruns(state);
1705                         state->eof_reached = false;
1706                         state->markpos_block = 0L;
1707                         state->markpos_offset = 0;
1708                         state->markpos_eof = false;
1709                         break;
1710
1711                 default:
1712                         elog(ERROR, "invalid tuplesort state");
1713                         break;
1714         }
1715
1716 #ifdef TRACE_SORT
1717         if (trace_sort)
1718         {
1719                 if (state->status == TSS_FINALMERGE)
1720                         elog(LOG, "performsort done (except %d-way final merge): %s",
1721                                  state->activeTapes,
1722                                  pg_rusage_show(&state->ru_start));
1723                 else
1724                         elog(LOG, "performsort done: %s",
1725                                  pg_rusage_show(&state->ru_start));
1726         }
1727 #endif
1728
1729         MemoryContextSwitchTo(oldcontext);
1730 }
1731
1732 /*
1733  * Internal routine to fetch the next tuple in either forward or back
1734  * direction into *stup.  Returns FALSE if no more tuples.
1735  * If *should_free is set, the caller must pfree stup.tuple when done with it.
1736  * Otherwise, caller should not use tuple following next call here.
1737  */
1738 static bool
1739 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
1740                                                   SortTuple *stup, bool *should_free)
1741 {
1742         unsigned int tuplen;
1743
1744         switch (state->status)
1745         {
1746                 case TSS_SORTEDINMEM:
1747                         Assert(forward || state->randomAccess);
1748                         Assert(!state->batchUsed);
1749                         *should_free = false;
1750                         if (forward)
1751                         {
1752                                 if (state->current < state->memtupcount)
1753                                 {
1754                                         *stup = state->memtuples[state->current++];
1755                                         return true;
1756                                 }
1757                                 state->eof_reached = true;
1758
1759                                 /*
1760                                  * Complain if caller tries to retrieve more tuples than
1761                                  * originally asked for in a bounded sort.  This is because
1762                                  * returning EOF here might be the wrong thing.
1763                                  */
1764                                 if (state->bounded && state->current >= state->bound)
1765                                         elog(ERROR, "retrieved too many tuples in a bounded sort");
1766
1767                                 return false;
1768                         }
1769                         else
1770                         {
1771                                 if (state->current <= 0)
1772                                         return false;
1773
1774                                 /*
1775                                  * if all tuples are fetched already then we return last
1776                                  * tuple, else - tuple before last returned.
1777                                  */
1778                                 if (state->eof_reached)
1779                                         state->eof_reached = false;
1780                                 else
1781                                 {
1782                                         state->current--;       /* last returned tuple */
1783                                         if (state->current <= 0)
1784                                                 return false;
1785                                 }
1786                                 *stup = state->memtuples[state->current - 1];
1787                                 return true;
1788                         }
1789                         break;
1790
1791                 case TSS_SORTEDONTAPE:
1792                         Assert(forward || state->randomAccess);
1793                         Assert(!state->batchUsed);
1794                         *should_free = true;
1795                         if (forward)
1796                         {
1797                                 if (state->eof_reached)
1798                                         return false;
1799                                 if ((tuplen = getlen(state, state->result_tape, true)) != 0)
1800                                 {
1801                                         READTUP(state, stup, state->result_tape, tuplen);
1802                                         return true;
1803                                 }
1804                                 else
1805                                 {
1806                                         state->eof_reached = true;
1807                                         return false;
1808                                 }
1809                         }
1810
1811                         /*
1812                          * Backward.
1813                          *
1814                          * if all tuples are fetched already then we return last tuple,
1815                          * else - tuple before last returned.
1816                          */
1817                         if (state->eof_reached)
1818                         {
1819                                 /*
1820                                  * Seek position is pointing just past the zero tuplen at the
1821                                  * end of file; back up to fetch last tuple's ending length
1822                                  * word.  If seek fails we must have a completely empty file.
1823                                  */
1824                                 if (!LogicalTapeBackspace(state->tapeset,
1825                                                                                   state->result_tape,
1826                                                                                   2 * sizeof(unsigned int)))
1827                                         return false;
1828                                 state->eof_reached = false;
1829                         }
1830                         else
1831                         {
1832                                 /*
1833                                  * Back up and fetch previously-returned tuple's ending length
1834                                  * word.  If seek fails, assume we are at start of file.
1835                                  */
1836                                 if (!LogicalTapeBackspace(state->tapeset,
1837                                                                                   state->result_tape,
1838                                                                                   sizeof(unsigned int)))
1839                                         return false;
1840                                 tuplen = getlen(state, state->result_tape, false);
1841
1842                                 /*
1843                                  * Back up to get ending length word of tuple before it.
1844                                  */
1845                                 if (!LogicalTapeBackspace(state->tapeset,
1846                                                                                   state->result_tape,
1847                                                                                   tuplen + 2 * sizeof(unsigned int)))
1848                                 {
1849                                         /*
1850                                          * If that fails, presumably the prev tuple is the first
1851                                          * in the file.  Back up so that it becomes next to read
1852                                          * in forward direction (not obviously right, but that is
1853                                          * what in-memory case does).
1854                                          */
1855                                         if (!LogicalTapeBackspace(state->tapeset,
1856                                                                                           state->result_tape,
1857                                                                                           tuplen + sizeof(unsigned int)))
1858                                                 elog(ERROR, "bogus tuple length in backward scan");
1859                                         return false;
1860                                 }
1861                         }
1862
1863                         tuplen = getlen(state, state->result_tape, false);
1864
1865                         /*
1866                          * Now we have the length of the prior tuple, back up and read it.
1867                          * Note: READTUP expects we are positioned after the initial
1868                          * length word of the tuple, so back up to that point.
1869                          */
1870                         if (!LogicalTapeBackspace(state->tapeset,
1871                                                                           state->result_tape,
1872                                                                           tuplen))
1873                                 elog(ERROR, "bogus tuple length in backward scan");
1874                         READTUP(state, stup, state->result_tape, tuplen);
1875                         return true;
1876
1877                 case TSS_FINALMERGE:
1878                         Assert(forward);
1879                         Assert(state->batchUsed || !state->tuples);
1880                         /* For now, assume tuple is stored in tape's batch memory */
1881                         *should_free = false;
1882
1883                         /*
1884                          * This code should match the inner loop of mergeonerun().
1885                          */
1886                         if (state->memtupcount > 0)
1887                         {
1888                                 int                     srcTape = state->memtuples[0].tupindex;
1889                                 int                     tupIndex;
1890                                 SortTuple  *newtup;
1891
1892                                 /*
1893                                  * Returned tuple is still counted in our memory space most
1894                                  * of the time.  See mergebatchone() for discussion of why
1895                                  * caller may occasionally be required to free returned
1896                                  * tuple, and how preread memory is managed with regard to
1897                                  * edge cases more generally.
1898                                  */
1899                                 *stup = state->memtuples[0];
1900                                 tuplesort_heap_siftup(state, false);
1901                                 if ((tupIndex = state->mergenext[srcTape]) == 0)
1902                                 {
1903                                         /*
1904                                          * out of preloaded data on this tape, try to read more
1905                                          *
1906                                          * Unlike mergeonerun(), we only preload from the single
1907                                          * tape that's run dry, though not before preparing its
1908                                          * batch memory for a new round of sequential consumption.
1909                                          * See mergepreread() comments.
1910                                          */
1911                                         if (state->batchUsed)
1912                                                 mergebatchone(state, srcTape, stup, should_free);
1913
1914                                         mergeprereadone(state, srcTape);
1915
1916                                         /*
1917                                          * if still no data, we've reached end of run on this tape
1918                                          */
1919                                         if ((tupIndex = state->mergenext[srcTape]) == 0)
1920                                         {
1921                                                 /* Free tape's buffer, avoiding dangling pointer */
1922                                                 if (state->batchUsed)
1923                                                         mergebatchfreetape(state, srcTape, stup, should_free);
1924                                                 return true;
1925                                         }
1926                                 }
1927                                 /* pull next preread tuple from list, insert in heap */
1928                                 newtup = &state->memtuples[tupIndex];
1929                                 state->mergenext[srcTape] = newtup->tupindex;
1930                                 if (state->mergenext[srcTape] == 0)
1931                                         state->mergelast[srcTape] = 0;
1932                                 tuplesort_heap_insert(state, newtup, srcTape, false);
1933                                 /* put the now-unused memtuples entry on the freelist */
1934                                 newtup->tupindex = state->mergefreelist;
1935                                 state->mergefreelist = tupIndex;
1936                                 state->mergeavailslots[srcTape]++;
1937                                 return true;
1938                         }
1939                         return false;
1940
1941                 default:
1942                         elog(ERROR, "invalid tuplesort state");
1943                         return false;           /* keep compiler quiet */
1944         }
1945 }
1946
1947 /*
1948  * Fetch the next tuple in either forward or back direction.
1949  * If successful, put tuple in slot and return TRUE; else, clear the slot
1950  * and return FALSE.
1951  *
1952  * Caller may optionally be passed back abbreviated value (on TRUE return
1953  * value) when abbreviation was used, which can be used to cheaply avoid
1954  * equality checks that might otherwise be required.  Caller can safely make a
1955  * determination of "non-equal tuple" based on simple binary inequality.  A
1956  * NULL value in leading attribute will set abbreviated value to zeroed
1957  * representation, which caller may rely on in abbreviated inequality check.
1958  */
1959 bool
1960 tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
1961                                            TupleTableSlot *slot, Datum *abbrev)
1962 {
1963         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1964         SortTuple       stup;
1965         bool            should_free;
1966
1967         if (!tuplesort_gettuple_common(state, forward, &stup, &should_free))
1968                 stup.tuple = NULL;
1969
1970         MemoryContextSwitchTo(oldcontext);
1971
1972         if (stup.tuple)
1973         {
1974                 /* Record abbreviated key for caller */
1975                 if (state->sortKeys->abbrev_converter && abbrev)
1976                         *abbrev = stup.datum1;
1977
1978                 ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, should_free);
1979                 return true;
1980         }
1981         else
1982         {
1983                 ExecClearTuple(slot);
1984                 return false;
1985         }
1986 }
1987
1988 /*
1989  * Fetch the next tuple in either forward or back direction.
1990  * Returns NULL if no more tuples.  If *should_free is set, the
1991  * caller must pfree the returned tuple when done with it.
1992  * If it is not set, caller should not use tuple following next
1993  * call here.
1994  */
1995 HeapTuple
1996 tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free)
1997 {
1998         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1999         SortTuple       stup;
2000
2001         if (!tuplesort_gettuple_common(state, forward, &stup, should_free))
2002                 stup.tuple = NULL;
2003
2004         MemoryContextSwitchTo(oldcontext);
2005
2006         return stup.tuple;
2007 }
2008
2009 /*
2010  * Fetch the next index tuple in either forward or back direction.
2011  * Returns NULL if no more tuples.  If *should_free is set, the
2012  * caller must pfree the returned tuple when done with it.
2013  * If it is not set, caller should not use tuple following next
2014  * call here.
2015  */
2016 IndexTuple
2017 tuplesort_getindextuple(Tuplesortstate *state, bool forward,
2018                                                 bool *should_free)
2019 {
2020         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2021         SortTuple       stup;
2022
2023         if (!tuplesort_gettuple_common(state, forward, &stup, should_free))
2024                 stup.tuple = NULL;
2025
2026         MemoryContextSwitchTo(oldcontext);
2027
2028         return (IndexTuple) stup.tuple;
2029 }
2030
2031 /*
2032  * Fetch the next Datum in either forward or back direction.
2033  * Returns FALSE if no more datums.
2034  *
2035  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2036  * and is now owned by the caller.
2037  *
2038  * Caller may optionally be passed back abbreviated value (on TRUE return
2039  * value) when abbreviation was used, which can be used to cheaply avoid
2040  * equality checks that might otherwise be required.  Caller can safely make a
2041  * determination of "non-equal tuple" based on simple binary inequality.  A
2042  * NULL value will have a zeroed abbreviated value representation, which caller
2043  * may rely on in abbreviated inequality check.
2044  */
2045 bool
2046 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2047                                    Datum *val, bool *isNull, Datum *abbrev)
2048 {
2049         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2050         SortTuple       stup;
2051         bool            should_free;
2052
2053         if (!tuplesort_gettuple_common(state, forward, &stup, &should_free))
2054         {
2055                 MemoryContextSwitchTo(oldcontext);
2056                 return false;
2057         }
2058
2059         /* Record abbreviated key for caller */
2060         if (state->sortKeys->abbrev_converter && abbrev)
2061                 *abbrev = stup.datum1;
2062
2063         if (stup.isnull1 || !state->tuples)
2064         {
2065                 *val = stup.datum1;
2066                 *isNull = stup.isnull1;
2067         }
2068         else
2069         {
2070                 /* use stup.tuple because stup.datum1 may be an abbreviation */
2071
2072                 if (should_free)
2073                         *val = PointerGetDatum(stup.tuple);
2074                 else
2075                         *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2076                 *isNull = false;
2077         }
2078
2079         MemoryContextSwitchTo(oldcontext);
2080
2081         return true;
2082 }
2083
2084 /*
2085  * Advance over N tuples in either forward or back direction,
2086  * without returning any data.  N==0 is a no-op.
2087  * Returns TRUE if successful, FALSE if ran out of tuples.
2088  */
2089 bool
2090 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2091 {
2092         MemoryContext oldcontext;
2093
2094         /*
2095          * We don't actually support backwards skip yet, because no callers need
2096          * it.  The API is designed to allow for that later, though.
2097          */
2098         Assert(forward);
2099         Assert(ntuples >= 0);
2100
2101         switch (state->status)
2102         {
2103                 case TSS_SORTEDINMEM:
2104                         if (state->memtupcount - state->current >= ntuples)
2105                         {
2106                                 state->current += ntuples;
2107                                 return true;
2108                         }
2109                         state->current = state->memtupcount;
2110                         state->eof_reached = true;
2111
2112                         /*
2113                          * Complain if caller tries to retrieve more tuples than
2114                          * originally asked for in a bounded sort.  This is because
2115                          * returning EOF here might be the wrong thing.
2116                          */
2117                         if (state->bounded && state->current >= state->bound)
2118                                 elog(ERROR, "retrieved too many tuples in a bounded sort");
2119
2120                         return false;
2121
2122                 case TSS_SORTEDONTAPE:
2123                 case TSS_FINALMERGE:
2124
2125                         /*
2126                          * We could probably optimize these cases better, but for now it's
2127                          * not worth the trouble.
2128                          */
2129                         oldcontext = MemoryContextSwitchTo(state->sortcontext);
2130                         while (ntuples-- > 0)
2131                         {
2132                                 SortTuple       stup;
2133                                 bool            should_free;
2134
2135                                 if (!tuplesort_gettuple_common(state, forward,
2136                                                                                            &stup, &should_free))
2137                                 {
2138                                         MemoryContextSwitchTo(oldcontext);
2139                                         return false;
2140                                 }
2141                                 if (should_free && stup.tuple)
2142                                         pfree(stup.tuple);
2143                                 CHECK_FOR_INTERRUPTS();
2144                         }
2145                         MemoryContextSwitchTo(oldcontext);
2146                         return true;
2147
2148                 default:
2149                         elog(ERROR, "invalid tuplesort state");
2150                         return false;           /* keep compiler quiet */
2151         }
2152 }
2153
2154 /*
2155  * tuplesort_merge_order - report merge order we'll use for given memory
2156  * (note: "merge order" just means the number of input tapes in the merge).
2157  *
2158  * This is exported for use by the planner.  allowedMem is in bytes.
2159  */
2160 int
2161 tuplesort_merge_order(int64 allowedMem)
2162 {
2163         int                     mOrder;
2164
2165         /*
2166          * We need one tape for each merge input, plus another one for the output,
2167          * and each of these tapes needs buffer space.  In addition we want
2168          * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't
2169          * count).
2170          *
2171          * Note: you might be thinking we need to account for the memtuples[]
2172          * array in this calculation, but we effectively treat that as part of the
2173          * MERGE_BUFFER_SIZE workspace.
2174          */
2175         mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
2176                 (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
2177
2178         /* Even in minimum memory, use at least a MINORDER merge */
2179         mOrder = Max(mOrder, MINORDER);
2180
2181         return mOrder;
2182 }
2183
2184 /*
2185  * inittapes - initialize for tape sorting.
2186  *
2187  * This is called only if we have found we don't have room to sort in memory.
2188  */
2189 static void
2190 inittapes(Tuplesortstate *state)
2191 {
2192         int                     maxTapes,
2193                                 ntuples,
2194                                 j;
2195         int64           tapeSpace;
2196
2197         /* Compute number of tapes to use: merge order plus 1 */
2198         maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
2199
2200         /*
2201          * We must have at least 2*maxTapes slots in the memtuples[] array, else
2202          * we'd not have room for merge heap plus preread.  It seems unlikely that
2203          * this case would ever occur, but be safe.
2204          */
2205         maxTapes = Min(maxTapes, state->memtupsize / 2);
2206
2207         state->maxTapes = maxTapes;
2208         state->tapeRange = maxTapes - 1;
2209
2210 #ifdef TRACE_SORT
2211         if (trace_sort)
2212                 elog(LOG, "switching to external sort with %d tapes: %s",
2213                          maxTapes, pg_rusage_show(&state->ru_start));
2214 #endif
2215
2216         /*
2217          * Decrease availMem to reflect the space needed for tape buffers; but
2218          * don't decrease it to the point that we have no room for tuples. (That
2219          * case is only likely to occur if sorting pass-by-value Datums; in all
2220          * other scenarios the memtuples[] array is unlikely to occupy more than
2221          * half of allowedMem.  In the pass-by-value case it's not important to
2222          * account for tuple space, so we don't care if LACKMEM becomes
2223          * inaccurate.)
2224          */
2225         tapeSpace = (int64) maxTapes *TAPE_BUFFER_OVERHEAD;
2226
2227         if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2228                 USEMEM(state, tapeSpace);
2229
2230         /*
2231          * Make sure that the temp file(s) underlying the tape set are created in
2232          * suitable temp tablespaces.
2233          */
2234         PrepareTempTablespaces();
2235
2236         /*
2237          * Create the tape set and allocate the per-tape data arrays.
2238          */
2239         state->tapeset = LogicalTapeSetCreate(maxTapes);
2240
2241         state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
2242         state->mergenext = (int *) palloc0(maxTapes * sizeof(int));
2243         state->mergelast = (int *) palloc0(maxTapes * sizeof(int));
2244         state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int));
2245         state->mergeavailmem = (int64 *) palloc0(maxTapes * sizeof(int64));
2246         state->mergetuples = (char **) palloc0(maxTapes * sizeof(char *));
2247         state->mergecurrent = (char **) palloc0(maxTapes * sizeof(char *));
2248         state->mergetail = (char **) palloc0(maxTapes * sizeof(char *));
2249         state->mergeoverflow = (char **) palloc0(maxTapes * sizeof(char *));
2250         state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
2251         state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
2252         state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
2253         state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
2254
2255         /*
2256          * Convert the unsorted contents of memtuples[] into a heap. Each tuple is
2257          * marked as belonging to run number zero.
2258          *
2259          * NOTE: we pass false for checkIndex since there's no point in comparing
2260          * indexes in this step, even though we do intend the indexes to be part
2261          * of the sort key...
2262          */
2263         ntuples = state->memtupcount;
2264         state->memtupcount = 0;         /* make the heap empty */
2265         for (j = 0; j < ntuples; j++)
2266         {
2267                 /* Must copy source tuple to avoid possible overwrite */
2268                 SortTuple       stup = state->memtuples[j];
2269
2270                 tuplesort_heap_insert(state, &stup, 0, false);
2271         }
2272         Assert(state->memtupcount == ntuples);
2273
2274         state->currentRun = 0;
2275
2276         /*
2277          * Initialize variables of Algorithm D (step D1).
2278          */
2279         for (j = 0; j < maxTapes; j++)
2280         {
2281                 state->tp_fib[j] = 1;
2282                 state->tp_runs[j] = 0;
2283                 state->tp_dummy[j] = 1;
2284                 state->tp_tapenum[j] = j;
2285         }
2286         state->tp_fib[state->tapeRange] = 0;
2287         state->tp_dummy[state->tapeRange] = 0;
2288
2289         state->Level = 1;
2290         state->destTape = 0;
2291
2292         state->status = TSS_BUILDRUNS;
2293 }
2294
2295 /*
2296  * selectnewtape -- select new tape for new initial run.
2297  *
2298  * This is called after finishing a run when we know another run
2299  * must be started.  This implements steps D3, D4 of Algorithm D.
2300  */
2301 static void
2302 selectnewtape(Tuplesortstate *state)
2303 {
2304         int                     j;
2305         int                     a;
2306
2307         /* Step D3: advance j (destTape) */
2308         if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
2309         {
2310                 state->destTape++;
2311                 return;
2312         }
2313         if (state->tp_dummy[state->destTape] != 0)
2314         {
2315                 state->destTape = 0;
2316                 return;
2317         }
2318
2319         /* Step D4: increase level */
2320         state->Level++;
2321         a = state->tp_fib[0];
2322         for (j = 0; j < state->tapeRange; j++)
2323         {
2324                 state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
2325                 state->tp_fib[j] = a + state->tp_fib[j + 1];
2326         }
2327         state->destTape = 0;
2328 }
2329
2330 /*
2331  * mergeruns -- merge all the completed initial runs.
2332  *
2333  * This implements steps D5, D6 of Algorithm D.  All input data has
2334  * already been written to initial runs on tape (see dumptuples).
2335  */
2336 static void
2337 mergeruns(Tuplesortstate *state)
2338 {
2339         int                     tapenum,
2340                                 svTape,
2341                                 svRuns,
2342                                 svDummy;
2343
2344         Assert(state->status == TSS_BUILDRUNS);
2345         Assert(state->memtupcount == 0);
2346
2347         if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2348         {
2349                 /*
2350                  * If there are multiple runs to be merged, when we go to read back
2351                  * tuples from disk, abbreviated keys will not have been stored, and
2352                  * we don't care to regenerate them.  Disable abbreviation from this
2353                  * point on.
2354                  */
2355                 state->sortKeys->abbrev_converter = NULL;
2356                 state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2357
2358                 /* Not strictly necessary, but be tidy */
2359                 state->sortKeys->abbrev_abort = NULL;
2360                 state->sortKeys->abbrev_full_comparator = NULL;
2361         }
2362
2363         /*
2364          * If we produced only one initial run (quite likely if the total data
2365          * volume is between 1X and 2X workMem), we can just use that tape as the
2366          * finished output, rather than doing a useless merge.  (This obvious
2367          * optimization is not in Knuth's algorithm.)
2368          */
2369         if (state->currentRun == 1)
2370         {
2371                 state->result_tape = state->tp_tapenum[state->destTape];
2372                 /* must freeze and rewind the finished output tape */
2373                 LogicalTapeFreeze(state->tapeset, state->result_tape);
2374                 state->status = TSS_SORTEDONTAPE;
2375                 return;
2376         }
2377
2378         /* End of step D2: rewind all output tapes to prepare for merging */
2379         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2380                 LogicalTapeRewind(state->tapeset, tapenum, false);
2381
2382         for (;;)
2383         {
2384                 /*
2385                  * At this point we know that tape[T] is empty.  If there's just one
2386                  * (real or dummy) run left on each input tape, then only one merge
2387                  * pass remains.  If we don't have to produce a materialized sorted
2388                  * tape, we can stop at this point and do the final merge on-the-fly.
2389                  */
2390                 if (!state->randomAccess)
2391                 {
2392                         bool            allOneRun = true;
2393
2394                         Assert(state->tp_runs[state->tapeRange] == 0);
2395                         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2396                         {
2397                                 if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
2398                                 {
2399                                         allOneRun = false;
2400                                         break;
2401                                 }
2402                         }
2403                         if (allOneRun)
2404                         {
2405                                 /* Tell logtape.c we won't be writing anymore */
2406                                 LogicalTapeSetForgetFreeSpace(state->tapeset);
2407                                 /* Initialize for the final merge pass */
2408                                 beginmerge(state, state->tuples);
2409                                 state->status = TSS_FINALMERGE;
2410                                 return;
2411                         }
2412                 }
2413
2414                 /* Step D5: merge runs onto tape[T] until tape[P] is empty */
2415                 while (state->tp_runs[state->tapeRange - 1] ||
2416                            state->tp_dummy[state->tapeRange - 1])
2417                 {
2418                         bool            allDummy = true;
2419
2420                         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2421                         {
2422                                 if (state->tp_dummy[tapenum] == 0)
2423                                 {
2424                                         allDummy = false;
2425                                         break;
2426                                 }
2427                         }
2428
2429                         if (allDummy)
2430                         {
2431                                 state->tp_dummy[state->tapeRange]++;
2432                                 for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2433                                         state->tp_dummy[tapenum]--;
2434                         }
2435                         else
2436                                 mergeonerun(state);
2437                 }
2438
2439                 /* Step D6: decrease level */
2440                 if (--state->Level == 0)
2441                         break;
2442                 /* rewind output tape T to use as new input */
2443                 LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange],
2444                                                   false);
2445                 /* rewind used-up input tape P, and prepare it for write pass */
2446                 LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange - 1],
2447                                                   true);
2448                 state->tp_runs[state->tapeRange - 1] = 0;
2449
2450                 /*
2451                  * reassign tape units per step D6; note we no longer care about A[]
2452                  */
2453                 svTape = state->tp_tapenum[state->tapeRange];
2454                 svDummy = state->tp_dummy[state->tapeRange];
2455                 svRuns = state->tp_runs[state->tapeRange];
2456                 for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
2457                 {
2458                         state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
2459                         state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
2460                         state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
2461                 }
2462                 state->tp_tapenum[0] = svTape;
2463                 state->tp_dummy[0] = svDummy;
2464                 state->tp_runs[0] = svRuns;
2465         }
2466
2467         /*
2468          * Done.  Knuth says that the result is on TAPE[1], but since we exited
2469          * the loop without performing the last iteration of step D6, we have not
2470          * rearranged the tape unit assignment, and therefore the result is on
2471          * TAPE[T].  We need to do it this way so that we can freeze the final
2472          * output tape while rewinding it.  The last iteration of step D6 would be
2473          * a waste of cycles anyway...
2474          */
2475         state->result_tape = state->tp_tapenum[state->tapeRange];
2476         LogicalTapeFreeze(state->tapeset, state->result_tape);
2477         state->status = TSS_SORTEDONTAPE;
2478 }
2479
2480 /*
2481  * Merge one run from each input tape, except ones with dummy runs.
2482  *
2483  * This is the inner loop of Algorithm D step D5.  We know that the
2484  * output tape is TAPE[T].
2485  */
2486 static void
2487 mergeonerun(Tuplesortstate *state)
2488 {
2489         int                     destTape = state->tp_tapenum[state->tapeRange];
2490         int                     srcTape;
2491         int                     tupIndex;
2492         SortTuple  *tup;
2493         int64           priorAvail,
2494                                 spaceFreed;
2495
2496         /*
2497          * Start the merge by loading one tuple from each active source tape into
2498          * the heap.  We can also decrease the input run/dummy run counts.
2499          */
2500         beginmerge(state, false);
2501
2502         /*
2503          * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2504          * out, and replacing it with next tuple from same tape (if there is
2505          * another one).
2506          */
2507         while (state->memtupcount > 0)
2508         {
2509                 /* write the tuple to destTape */
2510                 priorAvail = state->availMem;
2511                 srcTape = state->memtuples[0].tupindex;
2512                 WRITETUP(state, destTape, &state->memtuples[0]);
2513                 /* writetup adjusted total free space, now fix per-tape space */
2514                 spaceFreed = state->availMem - priorAvail;
2515                 state->mergeavailmem[srcTape] += spaceFreed;
2516                 /* compact the heap */
2517                 tuplesort_heap_siftup(state, false);
2518                 if ((tupIndex = state->mergenext[srcTape]) == 0)
2519                 {
2520                         /* out of preloaded data on this tape, try to read more */
2521                         mergepreread(state);
2522                         /* if still no data, we've reached end of run on this tape */
2523                         if ((tupIndex = state->mergenext[srcTape]) == 0)
2524                                 continue;
2525                 }
2526                 /* pull next preread tuple from list, insert in heap */
2527                 tup = &state->memtuples[tupIndex];
2528                 state->mergenext[srcTape] = tup->tupindex;
2529                 if (state->mergenext[srcTape] == 0)
2530                         state->mergelast[srcTape] = 0;
2531                 tuplesort_heap_insert(state, tup, srcTape, false);
2532                 /* put the now-unused memtuples entry on the freelist */
2533                 tup->tupindex = state->mergefreelist;
2534                 state->mergefreelist = tupIndex;
2535                 state->mergeavailslots[srcTape]++;
2536         }
2537
2538         /*
2539          * Reset tuple memory.  We've freed all of the tuples that we previously
2540          * allocated, but AllocSetFree will have put those chunks of memory on
2541          * particular free lists, bucketed by size class.  Thus, although all of
2542          * that memory is free, it is effectively fragmented.  Resetting the
2543          * context gets us out from under that problem.
2544          */
2545         MemoryContextReset(state->tuplecontext);
2546
2547         /*
2548          * When the heap empties, we're done.  Write an end-of-run marker on the
2549          * output tape, and increment its count of real runs.
2550          */
2551         markrunend(state, destTape);
2552         state->tp_runs[state->tapeRange]++;
2553
2554 #ifdef TRACE_SORT
2555         if (trace_sort)
2556                 elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
2557                          pg_rusage_show(&state->ru_start));
2558 #endif
2559 }
2560
2561 /*
2562  * beginmerge - initialize for a merge pass
2563  *
2564  * We decrease the counts of real and dummy runs for each tape, and mark
2565  * which tapes contain active input runs in mergeactive[].  Then, load
2566  * as many tuples as we can from each active input tape, and finally
2567  * fill the merge heap with the first tuple from each active tape.
2568  *
2569  * finalMergeBatch indicates if this is the beginning of a final on-the-fly
2570  * merge where a batched allocation of tuple memory is required.
2571  */
2572 static void
2573 beginmerge(Tuplesortstate *state, bool finalMergeBatch)
2574 {
2575         int                     activeTapes;
2576         int                     tapenum;
2577         int                     srcTape;
2578         int                     slotsPerTape;
2579         int64           spacePerTape;
2580
2581         /* Heap should be empty here */
2582         Assert(state->memtupcount == 0);
2583
2584         /* Adjust run counts and mark the active tapes */
2585         memset(state->mergeactive, 0,
2586                    state->maxTapes * sizeof(*state->mergeactive));
2587         activeTapes = 0;
2588         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2589         {
2590                 if (state->tp_dummy[tapenum] > 0)
2591                         state->tp_dummy[tapenum]--;
2592                 else
2593                 {
2594                         Assert(state->tp_runs[tapenum] > 0);
2595                         state->tp_runs[tapenum]--;
2596                         srcTape = state->tp_tapenum[tapenum];
2597                         state->mergeactive[srcTape] = true;
2598                         activeTapes++;
2599                 }
2600         }
2601         state->activeTapes = activeTapes;
2602
2603         /* Clear merge-pass state variables */
2604         memset(state->mergenext, 0,
2605                    state->maxTapes * sizeof(*state->mergenext));
2606         memset(state->mergelast, 0,
2607                    state->maxTapes * sizeof(*state->mergelast));
2608         state->mergefreelist = 0;       /* nothing in the freelist */
2609         state->mergefirstfree = activeTapes;            /* 1st slot avail for preread */
2610
2611         if (finalMergeBatch)
2612         {
2613                 /* Free outright buffers for tape never actually allocated */
2614                 FREEMEM(state, (state->maxTapes - activeTapes) * TAPE_BUFFER_OVERHEAD);
2615
2616                 /*
2617                  * Grow memtuples one last time, since the palloc() overhead no longer
2618                  * incurred can make a big difference
2619                  */
2620                 batchmemtuples(state);
2621         }
2622
2623         /*
2624          * Initialize space allocation to let each active input tape have an equal
2625          * share of preread space.
2626          */
2627         Assert(activeTapes > 0);
2628         slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes;
2629         Assert(slotsPerTape > 0);
2630         spacePerTape = MAXALIGN_DOWN(state->availMem / activeTapes);
2631         for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2632         {
2633                 if (state->mergeactive[srcTape])
2634                 {
2635                         state->mergeavailslots[srcTape] = slotsPerTape;
2636                         state->mergeavailmem[srcTape] = spacePerTape;
2637                 }
2638         }
2639
2640         /*
2641          * Preallocate tuple batch memory for each tape.  This is the memory used
2642          * for tuples themselves (not SortTuples), so it's never used by
2643          * pass-by-value datum sorts.  Memory allocation is performed here at most
2644          * once per sort, just in advance of the final on-the-fly merge step.
2645          */
2646         if (finalMergeBatch)
2647                 mergebatch(state, spacePerTape);
2648
2649         /*
2650          * Preread as many tuples as possible (and at least one) from each active
2651          * tape
2652          */
2653         mergepreread(state);
2654
2655         /* Load the merge heap with the first tuple from each input tape */
2656         for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2657         {
2658                 int                     tupIndex = state->mergenext[srcTape];
2659                 SortTuple  *tup;
2660
2661                 if (tupIndex)
2662                 {
2663                         tup = &state->memtuples[tupIndex];
2664                         state->mergenext[srcTape] = tup->tupindex;
2665                         if (state->mergenext[srcTape] == 0)
2666                                 state->mergelast[srcTape] = 0;
2667                         tuplesort_heap_insert(state, tup, srcTape, false);
2668                         /* put the now-unused memtuples entry on the freelist */
2669                         tup->tupindex = state->mergefreelist;
2670                         state->mergefreelist = tupIndex;
2671                         state->mergeavailslots[srcTape]++;
2672
2673 #ifdef TRACE_SORT
2674                         if (trace_sort && finalMergeBatch)
2675                         {
2676                                 int64           perTapeKB = (spacePerTape + 1023) / 1024;
2677                                 int64           usedSpaceKB;
2678                                 int                     usedSlots;
2679
2680                                 /*
2681                                  * Report how effective batchmemtuples() was in balancing
2682                                  * the number of slots against the need for memory for the
2683                                  * underlying tuples (e.g. IndexTuples).  The big preread of
2684                                  * all tapes when switching to FINALMERGE state should be
2685                                  * fairly representative of memory utilization during the
2686                                  * final merge step, and in any case is the only point at
2687                                  * which all tapes are guaranteed to have depleted either
2688                                  * their batch memory allowance or slot allowance.  Ideally,
2689                                  * both will be completely depleted for every tape by now.
2690                                  */
2691                                 usedSpaceKB = (state->mergecurrent[srcTape] -
2692                                                            state->mergetuples[srcTape] + 1023) / 1024;
2693                                 usedSlots = slotsPerTape - state->mergeavailslots[srcTape];
2694
2695                                 elog(LOG, "tape %d initially used %ld KB of %ld KB batch "
2696                                          "(%2.3f) and %d out of %d slots (%2.3f)", srcTape,
2697                                          usedSpaceKB, perTapeKB,
2698                                          (double) usedSpaceKB / (double) perTapeKB,
2699                                          usedSlots, slotsPerTape,
2700                                          (double) usedSlots / (double) slotsPerTape);
2701                         }
2702 #endif
2703                 }
2704         }
2705 }
2706
2707 /*
2708  * batchmemtuples - grow memtuples without palloc overhead
2709  *
2710  * When called, availMem should be approximately the amount of memory we'd
2711  * require to allocate memtupsize - memtupcount tuples (not SortTuples/slots)
2712  * that were allocated with palloc() overhead, and in doing so use up all
2713  * allocated slots.  However, though slots and tuple memory is in balance
2714  * following the last grow_memtuples() call, that's predicated on the observed
2715  * average tuple size for the "final" grow_memtuples() call, which includes
2716  * palloc overhead.  During the final merge pass, where we will arrange to
2717  * squeeze out the palloc overhead, we might need more slots in the memtuples
2718  * array.
2719  *
2720  * To make that happen, arrange for the amount of remaining memory to be
2721  * exactly equal to the palloc overhead multiplied by the current size of
2722  * the memtuples array, force the grow_memtuples flag back to true (it's
2723  * probably but not necessarily false on entry to this routine), and then
2724  * call grow_memtuples.  This simulates loading enough tuples to fill the
2725  * whole memtuples array and then having some space left over because of the
2726  * elided palloc overhead.  We expect that grow_memtuples() will conclude that
2727  * it can't double the size of the memtuples array but that it can increase
2728  * it by some percentage; but if it does decide to double it, that just means
2729  * that we've never managed to use many slots in the memtuples array, in which
2730  * case doubling it shouldn't hurt anything anyway.
2731  */
2732 static void
2733 batchmemtuples(Tuplesortstate *state)
2734 {
2735         int64                   refund;
2736         int64                   availMemLessRefund;
2737         int                             memtupsize = state->memtupsize;
2738
2739         /* For simplicity, assume no memtuples are actually currently counted */
2740         Assert(state->memtupcount == 0);
2741
2742         /*
2743          * Refund STANDARDCHUNKHEADERSIZE per tuple.
2744          *
2745          * This sometimes fails to make memory use perfectly balanced, but it
2746          * should never make the situation worse.  Note that Assert-enabled builds
2747          * get a larger refund, due to a varying STANDARDCHUNKHEADERSIZE.
2748          */
2749         refund = memtupsize * STANDARDCHUNKHEADERSIZE;
2750         availMemLessRefund = state->availMem - refund;
2751
2752         /*
2753          * To establish balanced memory use after refunding palloc overhead,
2754          * temporarily have our accounting indicate that we've allocated all
2755          * memory we're allowed to less that refund, and call grow_memtuples()
2756          * to have it increase the number of slots.
2757          */
2758         state->growmemtuples = true;
2759         USEMEM(state, availMemLessRefund);
2760         (void) grow_memtuples(state);
2761         /* Should not matter, but be tidy */
2762         FREEMEM(state, availMemLessRefund);
2763         state->growmemtuples = false;
2764
2765 #ifdef TRACE_SORT
2766         if (trace_sort)
2767         {
2768                 Size    OldKb = (memtupsize * sizeof(SortTuple) + 1023) / 1024;
2769                 Size    NewKb = (state->memtupsize * sizeof(SortTuple) + 1023) / 1024;
2770
2771                 elog(LOG, "grew memtuples %1.2fx from %d (%zu KB) to %d (%zu KB) for final merge",
2772                          (double) NewKb / (double) OldKb,
2773                          memtupsize, OldKb,
2774                          state->memtupsize, NewKb);
2775         }
2776 #endif
2777 }
2778
2779 /*
2780  * mergebatch - initialize tuple memory in batch
2781  *
2782  * This allows sequential access to sorted tuples buffered in memory from
2783  * tapes/runs on disk during a final on-the-fly merge step.  Note that the
2784  * memory is not used for SortTuples, but for the underlying tuples (e.g.
2785  * MinimalTuples).
2786  *
2787  * Note that when batch memory is used, there is a simple division of space
2788  * into large buffers (one per active tape).  The conventional incremental
2789  * memory accounting (calling USEMEM() and FREEMEM()) is abandoned.  Instead,
2790  * when each tape's memory budget is exceeded, a retail palloc() "overflow" is
2791  * performed, which is then immediately detected in a way that is analogous to
2792  * LACKMEM().  This keeps each tape's use of memory fair, which is always a
2793  * goal.
2794  */
2795 static void
2796 mergebatch(Tuplesortstate *state, int64 spacePerTape)
2797 {
2798         int             srcTape;
2799
2800         Assert(state->activeTapes > 0);
2801         Assert(state->tuples);
2802
2803         /*
2804          * For the purposes of tuplesort's memory accounting, the batch allocation
2805          * is special, and regular memory accounting through USEMEM() calls is
2806          * abandoned (see mergeprereadone()).
2807          */
2808         for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2809         {
2810                 char       *mergetuples;
2811
2812                 if (!state->mergeactive[srcTape])
2813                         continue;
2814
2815                 /* Allocate buffer for each active tape */
2816                 mergetuples = MemoryContextAllocHuge(state->tuplecontext,
2817                                                                                          spacePerTape);
2818
2819                 /* Initialize state for tape */
2820                 state->mergetuples[srcTape] = mergetuples;
2821                 state->mergecurrent[srcTape] = mergetuples;
2822                 state->mergetail[srcTape] = mergetuples;
2823                 state->mergeoverflow[srcTape] = NULL;
2824         }
2825
2826         state->batchUsed = true;
2827         state->spacePerTape = spacePerTape;
2828 }
2829
2830 /*
2831  * mergebatchone - prepare batch memory for one merge input tape
2832  *
2833  * This is called following the exhaustion of preread tuples for one input
2834  * tape.  All that actually occurs is that the state for the source tape is
2835  * reset to indicate that all memory may be reused.
2836  *
2837  * This routine must deal with fixing up the tuple that is about to be returned
2838  * to the client, due to "overflow" allocations.
2839  */
2840 static void
2841 mergebatchone(Tuplesortstate *state, int srcTape, SortTuple *rtup,
2842                           bool *should_free)
2843 {
2844         Assert(state->batchUsed);
2845
2846         /*
2847          * Tuple about to be returned to caller ("stup") is final preread tuple
2848          * from tape, just removed from the top of the heap.  Special steps around
2849          * memory management must be performed for that tuple, to make sure it
2850          * isn't overwritten early.
2851          */
2852         if (!state->mergeoverflow[srcTape])
2853         {
2854                 Size    tupLen;
2855
2856                 /*
2857                  * Mark tuple buffer range for reuse, but be careful to move final,
2858                  * tail tuple to start of space for next run so that it's available
2859                  * to caller when stup is returned, and remains available at least
2860                  * until the next tuple is requested.
2861                  */
2862                 tupLen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
2863                 state->mergecurrent[srcTape] = state->mergetuples[srcTape];
2864                 memmove(state->mergecurrent[srcTape], state->mergetail[srcTape],
2865                                 tupLen);
2866
2867                 /* Make SortTuple at top of the merge heap point to new tuple */
2868                 rtup->tuple = (void *) state->mergecurrent[srcTape];
2869
2870                 state->mergetail[srcTape] = state->mergecurrent[srcTape];
2871                 state->mergecurrent[srcTape] += tupLen;
2872         }
2873         else
2874         {
2875                 /*
2876                  * Handle an "overflow" retail palloc.
2877                  *
2878                  * This is needed when we run out of tuple memory for the tape.
2879                  */
2880                 state->mergecurrent[srcTape] = state->mergetuples[srcTape];
2881                 state->mergetail[srcTape] = state->mergetuples[srcTape];
2882
2883                 if (rtup->tuple)
2884                 {
2885                         Assert(rtup->tuple == (void *) state->mergeoverflow[srcTape]);
2886                         /* Caller should free palloc'd tuple */
2887                         *should_free = true;
2888                 }
2889                 state->mergeoverflow[srcTape] = NULL;
2890         }
2891 }
2892
2893 /*
2894  * mergebatchfreetape - handle final clean-up for batch memory once tape is
2895  * about to become exhausted
2896  *
2897  * All tuples are returned from tape, but a single final tuple, *rtup, is to be
2898  * passed back to caller.  Free tape's batch allocation buffer while ensuring
2899  * that the final tuple is managed appropriately.
2900  */
2901 static void
2902 mergebatchfreetape(Tuplesortstate *state, int srcTape, SortTuple *rtup,
2903                                    bool *should_free)
2904 {
2905         Assert(state->batchUsed);
2906         Assert(state->status == TSS_FINALMERGE);
2907
2908         /*
2909          * Tuple may or may not already be an overflow allocation from
2910          * mergebatchone()
2911          */
2912         if (!*should_free && rtup->tuple)
2913         {
2914                 /*
2915                  * Final tuple still in tape's batch allocation.
2916                  *
2917                  * Return palloc()'d copy to caller, and have it freed in a similar
2918                  * manner to overflow allocation.  Otherwise, we'd free batch memory
2919                  * and pass back a pointer to garbage.  Note that we deliberately
2920                  * allocate this in the parent tuplesort context, to be on the safe
2921                  * side.
2922                  */
2923                 Size            tuplen;
2924                 void       *oldTuple = rtup->tuple;
2925
2926                 tuplen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
2927                 rtup->tuple = MemoryContextAlloc(state->sortcontext, tuplen);
2928                 memcpy(rtup->tuple, oldTuple, tuplen);
2929                 *should_free = true;
2930         }
2931
2932         /* Free spacePerTape-sized buffer */
2933         pfree(state->mergetuples[srcTape]);
2934 }
2935
2936 /*
2937  * mergebatchalloc - allocate memory for one tuple using a batch memory
2938  * "logical allocation".
2939  *
2940  * This is used for the final on-the-fly merge phase only.  READTUP() routines
2941  * receive memory from here in place of palloc() and USEMEM() calls.
2942  *
2943  * Tuple tapenum is passed, ensuring each tape's tuples are stored in sorted,
2944  * contiguous order (while allowing safe reuse of memory made available to
2945  * each tape).  This maximizes locality of access as tuples are returned by
2946  * final merge.
2947  *
2948  * Caller must not subsequently attempt to free memory returned here.  In
2949  * general, only mergebatch* functions know about how memory returned from
2950  * here should be freed, and this function's caller must ensure that batch
2951  * memory management code will definitely have the opportunity to do the right
2952  * thing during the final on-the-fly merge.
2953  */
2954 static void *
2955 mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen)
2956 {
2957         Size            reserve_tuplen = MAXALIGN(tuplen);
2958         char       *ret;
2959
2960         /* Should overflow at most once before mergebatchone() call: */
2961         Assert(state->mergeoverflow[tapenum] == NULL);
2962         Assert(state->batchUsed);
2963
2964         /* It should be possible to use precisely spacePerTape memory at once */
2965         if (state->mergecurrent[tapenum] + reserve_tuplen <=
2966                 state->mergetuples[tapenum] + state->spacePerTape)
2967         {
2968                 /*
2969                  * Usual case -- caller is returned pointer into its tape's buffer, and
2970                  * an offset from that point is recorded as where tape has consumed up
2971                  * to for current round of preloading.
2972                  */
2973                 ret = state->mergetail[tapenum] = state->mergecurrent[tapenum];
2974                 state->mergecurrent[tapenum] += reserve_tuplen;
2975         }
2976         else
2977         {
2978                 /*
2979                  * Allocate memory, and record as tape's overflow allocation.  This
2980                  * will be detected quickly, in a similar fashion to a LACKMEM()
2981                  * condition, and should not happen again before a new round of
2982                  * preloading for caller's tape.  Note that we deliberately allocate
2983                  * this in the parent tuplesort context, to be on the safe side.
2984                  *
2985                  * Sometimes, this does not happen because merging runs out of slots
2986                  * before running out of memory.
2987                  */
2988                 ret = state->mergeoverflow[tapenum] =
2989                         MemoryContextAlloc(state->sortcontext, tuplen);
2990         }
2991
2992         return ret;
2993 }
2994
2995 /*
2996  * mergepreread - load tuples from merge input tapes
2997  *
2998  * This routine exists to improve sequentiality of reads during a merge pass,
2999  * as explained in the header comments of this file.  Load tuples from each
3000  * active source tape until the tape's run is exhausted or it has used up
3001  * its fair share of available memory.  In any case, we guarantee that there
3002  * is at least one preread tuple available from each unexhausted input tape.
3003  *
3004  * We invoke this routine at the start of a merge pass for initial load,
3005  * and then whenever any tape's preread data runs out.  Note that we load
3006  * as much data as possible from all tapes, not just the one that ran out.
3007  * This is because logtape.c works best with a usage pattern that alternates
3008  * between reading a lot of data and writing a lot of data, so whenever we
3009  * are forced to read, we should fill working memory completely.
3010  *
3011  * In FINALMERGE state, we *don't* use this routine, but instead just preread
3012  * from the single tape that ran dry.  There's no read/write alternation in
3013  * that state and so no point in scanning through all the tapes to fix one.
3014  * (Moreover, there may be quite a lot of inactive tapes in that state, since
3015  * we might have had many fewer runs than tapes.  In a regular tape-to-tape
3016  * merge we can expect most of the tapes to be active.  Plus, only
3017  * FINALMERGE state has to consider memory management for a batch
3018  * allocation.)
3019  */
3020 static void
3021 mergepreread(Tuplesortstate *state)
3022 {
3023         int                     srcTape;
3024
3025         for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
3026                 mergeprereadone(state, srcTape);
3027 }
3028
3029 /*
3030  * mergeprereadone - load tuples from one merge input tape
3031  *
3032  * Read tuples from the specified tape until it has used up its free memory
3033  * or array slots; but ensure that we have at least one tuple, if any are
3034  * to be had.
3035  */
3036 static void
3037 mergeprereadone(Tuplesortstate *state, int srcTape)
3038 {
3039         unsigned int tuplen;
3040         SortTuple       stup;
3041         int                     tupIndex;
3042         int64           priorAvail,
3043                                 spaceUsed;
3044
3045         if (!state->mergeactive[srcTape])
3046                 return;                                 /* tape's run is already exhausted */
3047
3048         /*
3049          * Manage per-tape availMem.  Only actually matters when batch memory not
3050          * in use.
3051          */
3052         priorAvail = state->availMem;
3053         state->availMem = state->mergeavailmem[srcTape];
3054
3055         /*
3056          * When batch memory is used if final on-the-fly merge, only mergeoverflow
3057          * test is relevant; otherwise, only LACKMEM() test is relevant.
3058          */
3059         while ((state->mergeavailslots[srcTape] > 0 &&
3060                         state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) ||
3061                    state->mergenext[srcTape] == 0)
3062         {
3063                 /* read next tuple, if any */
3064                 if ((tuplen = getlen(state, srcTape, true)) == 0)
3065                 {
3066                         state->mergeactive[srcTape] = false;
3067                         break;
3068                 }
3069                 READTUP(state, &stup, srcTape, tuplen);
3070                 /* find a free slot in memtuples[] for it */
3071                 tupIndex = state->mergefreelist;
3072                 if (tupIndex)
3073                         state->mergefreelist = state->memtuples[tupIndex].tupindex;
3074                 else
3075                 {
3076                         tupIndex = state->mergefirstfree++;
3077                         Assert(tupIndex < state->memtupsize);
3078                 }
3079                 state->mergeavailslots[srcTape]--;
3080                 /* store tuple, append to list for its tape */
3081                 stup.tupindex = 0;
3082                 state->memtuples[tupIndex] = stup;
3083                 if (state->mergelast[srcTape])
3084                         state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex;
3085                 else
3086                         state->mergenext[srcTape] = tupIndex;
3087                 state->mergelast[srcTape] = tupIndex;
3088         }
3089         /* update per-tape and global availmem counts */
3090         spaceUsed = state->mergeavailmem[srcTape] - state->availMem;
3091         state->mergeavailmem[srcTape] = state->availMem;
3092         state->availMem = priorAvail - spaceUsed;
3093 }
3094
3095 /*
3096  * dumptuples - remove tuples from heap and write to tape
3097  *
3098  * This is used during initial-run building, but not during merging.
3099  *
3100  * When alltuples = false, dump only enough tuples to get under the
3101  * availMem limit (and leave at least one tuple in the heap in any case,
3102  * since puttuple assumes it always has a tuple to compare to).  We also
3103  * insist there be at least one free slot in the memtuples[] array.
3104  *
3105  * When alltuples = true, dump everything currently in memory.
3106  * (This case is only used at end of input data.)
3107  *
3108  * If we empty the heap, close out the current run and return (this should
3109  * only happen at end of input data).  If we see that the tuple run number
3110  * at the top of the heap has changed, start a new run.
3111  */
3112 static void
3113 dumptuples(Tuplesortstate *state, bool alltuples)
3114 {
3115         while (alltuples ||
3116                    (LACKMEM(state) && state->memtupcount > 1) ||
3117                    state->memtupcount >= state->memtupsize)
3118         {
3119                 /*
3120                  * Dump the heap's frontmost entry, and sift up to remove it from the
3121                  * heap.
3122                  */
3123                 Assert(state->memtupcount > 0);
3124                 WRITETUP(state, state->tp_tapenum[state->destTape],
3125                                  &state->memtuples[0]);
3126                 tuplesort_heap_siftup(state, true);
3127
3128                 /*
3129                  * If the heap is empty *or* top run number has changed, we've
3130                  * finished the current run.
3131                  */
3132                 if (state->memtupcount == 0 ||
3133                         state->currentRun != state->memtuples[0].tupindex)
3134                 {
3135                         markrunend(state, state->tp_tapenum[state->destTape]);
3136                         state->currentRun++;
3137                         state->tp_runs[state->destTape]++;
3138                         state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
3139
3140 #ifdef TRACE_SORT
3141                         if (trace_sort)
3142                                 elog(LOG, "finished writing%s run %d to tape %d: %s",
3143                                          (state->memtupcount == 0) ? " final" : "",
3144                                          state->currentRun, state->destTape,
3145                                          pg_rusage_show(&state->ru_start));
3146 #endif
3147
3148                         /*
3149                          * Done if heap is empty, else prepare for new run.
3150                          */
3151                         if (state->memtupcount == 0)
3152                                 break;
3153                         Assert(state->currentRun == state->memtuples[0].tupindex);
3154                         selectnewtape(state);
3155                 }
3156         }
3157 }
3158
3159 /*
3160  * tuplesort_rescan             - rewind and replay the scan
3161  */
3162 void
3163 tuplesort_rescan(Tuplesortstate *state)
3164 {
3165         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3166
3167         Assert(state->randomAccess);
3168
3169         switch (state->status)
3170         {
3171                 case TSS_SORTEDINMEM:
3172                         state->current = 0;
3173                         state->eof_reached = false;
3174                         state->markpos_offset = 0;
3175                         state->markpos_eof = false;
3176                         break;
3177                 case TSS_SORTEDONTAPE:
3178                         LogicalTapeRewind(state->tapeset,
3179                                                           state->result_tape,
3180                                                           false);
3181                         state->eof_reached = false;
3182                         state->markpos_block = 0L;
3183                         state->markpos_offset = 0;
3184                         state->markpos_eof = false;
3185                         break;
3186                 default:
3187                         elog(ERROR, "invalid tuplesort state");
3188                         break;
3189         }
3190
3191         MemoryContextSwitchTo(oldcontext);
3192 }
3193
3194 /*
3195  * tuplesort_markpos    - saves current position in the merged sort file
3196  */
3197 void
3198 tuplesort_markpos(Tuplesortstate *state)
3199 {
3200         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3201
3202         Assert(state->randomAccess);
3203
3204         switch (state->status)
3205         {
3206                 case TSS_SORTEDINMEM:
3207                         state->markpos_offset = state->current;
3208                         state->markpos_eof = state->eof_reached;
3209                         break;
3210                 case TSS_SORTEDONTAPE:
3211                         LogicalTapeTell(state->tapeset,
3212                                                         state->result_tape,
3213                                                         &state->markpos_block,
3214                                                         &state->markpos_offset);
3215                         state->markpos_eof = state->eof_reached;
3216                         break;
3217                 default:
3218                         elog(ERROR, "invalid tuplesort state");
3219                         break;
3220         }
3221
3222         MemoryContextSwitchTo(oldcontext);
3223 }
3224
3225 /*
3226  * tuplesort_restorepos - restores current position in merged sort file to
3227  *                                                last saved position
3228  */
3229 void
3230 tuplesort_restorepos(Tuplesortstate *state)
3231 {
3232         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3233
3234         Assert(state->randomAccess);
3235
3236         switch (state->status)
3237         {
3238                 case TSS_SORTEDINMEM:
3239                         state->current = state->markpos_offset;
3240                         state->eof_reached = state->markpos_eof;
3241                         break;
3242                 case TSS_SORTEDONTAPE:
3243                         if (!LogicalTapeSeek(state->tapeset,
3244                                                                  state->result_tape,
3245                                                                  state->markpos_block,
3246                                                                  state->markpos_offset))
3247                                 elog(ERROR, "tuplesort_restorepos failed");
3248                         state->eof_reached = state->markpos_eof;
3249                         break;
3250                 default:
3251                         elog(ERROR, "invalid tuplesort state");
3252                         break;
3253         }
3254
3255         MemoryContextSwitchTo(oldcontext);
3256 }
3257
3258 /*
3259  * tuplesort_get_stats - extract summary statistics
3260  *
3261  * This can be called after tuplesort_performsort() finishes to obtain
3262  * printable summary information about how the sort was performed.
3263  * spaceUsed is measured in kilobytes.
3264  */
3265 void
3266 tuplesort_get_stats(Tuplesortstate *state,
3267                                         const char **sortMethod,
3268                                         const char **spaceType,
3269                                         long *spaceUsed)
3270 {
3271         /*
3272          * Note: it might seem we should provide both memory and disk usage for a
3273          * disk-based sort.  However, the current code doesn't track memory space
3274          * accurately once we have begun to return tuples to the caller (since we
3275          * don't account for pfree's the caller is expected to do), so we cannot
3276          * rely on availMem in a disk sort.  This does not seem worth the overhead
3277          * to fix.  Is it worth creating an API for the memory context code to
3278          * tell us how much is actually used in sortcontext?
3279          */
3280         if (state->tapeset)
3281         {
3282                 *spaceType = "Disk";
3283                 *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
3284         }
3285         else
3286         {
3287                 *spaceType = "Memory";
3288                 *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
3289         }
3290
3291         switch (state->status)
3292         {
3293                 case TSS_SORTEDINMEM:
3294                         if (state->boundUsed)
3295                                 *sortMethod = "top-N heapsort";
3296                         else
3297                                 *sortMethod = "quicksort";
3298                         break;
3299                 case TSS_SORTEDONTAPE:
3300                         *sortMethod = "external sort";
3301                         break;
3302                 case TSS_FINALMERGE:
3303                         *sortMethod = "external merge";
3304                         break;
3305                 default:
3306                         *sortMethod = "still in progress";
3307                         break;
3308         }
3309 }
3310
3311
3312 /*
3313  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3314  *
3315  * Compare two SortTuples.  If checkIndex is true, use the tuple index
3316  * as the front of the sort key; otherwise, no.
3317  */
3318
3319 #define HEAPCOMPARE(tup1,tup2) \
3320         (checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \
3321          ((tup1)->tupindex) - ((tup2)->tupindex) : \
3322          COMPARETUP(state, tup1, tup2))
3323
3324 /*
3325  * Convert the existing unordered array of SortTuples to a bounded heap,
3326  * discarding all but the smallest "state->bound" tuples.
3327  *
3328  * When working with a bounded heap, we want to keep the largest entry
3329  * at the root (array entry zero), instead of the smallest as in the normal
3330  * sort case.  This allows us to discard the largest entry cheaply.
3331  * Therefore, we temporarily reverse the sort direction.
3332  *
3333  * We assume that all entries in a bounded heap will always have tupindex
3334  * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
3335  * the direction of comparison for tupindexes.
3336  */
3337 static void
3338 make_bounded_heap(Tuplesortstate *state)
3339 {
3340         int                     tupcount = state->memtupcount;
3341         int                     i;
3342
3343         Assert(state->status == TSS_INITIAL);
3344         Assert(state->bounded);
3345         Assert(tupcount >= state->bound);
3346
3347         /* Reverse sort direction so largest entry will be at root */
3348         reversedirection(state);
3349
3350         state->memtupcount = 0;         /* make the heap empty */
3351         for (i = 0; i < tupcount; i++)
3352         {
3353                 if (state->memtupcount >= state->bound &&
3354                   COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3355                 {
3356                         /* New tuple would just get thrown out, so skip it */
3357                         free_sort_tuple(state, &state->memtuples[i]);
3358                         CHECK_FOR_INTERRUPTS();
3359                 }
3360                 else
3361                 {
3362                         /* Insert next tuple into heap */
3363                         /* Must copy source tuple to avoid possible overwrite */
3364                         SortTuple       stup = state->memtuples[i];
3365
3366                         tuplesort_heap_insert(state, &stup, 0, false);
3367
3368                         /* If heap too full, discard largest entry */
3369                         if (state->memtupcount > state->bound)
3370                         {
3371                                 free_sort_tuple(state, &state->memtuples[0]);
3372                                 tuplesort_heap_siftup(state, false);
3373                         }
3374                 }
3375         }
3376
3377         Assert(state->memtupcount == state->bound);
3378         state->status = TSS_BOUNDED;
3379 }
3380
3381 /*
3382  * Convert the bounded heap to a properly-sorted array
3383  */
3384 static void
3385 sort_bounded_heap(Tuplesortstate *state)
3386 {
3387         int                     tupcount = state->memtupcount;
3388
3389         Assert(state->status == TSS_BOUNDED);
3390         Assert(state->bounded);
3391         Assert(tupcount == state->bound);
3392
3393         /*
3394          * We can unheapify in place because each sift-up will remove the largest
3395          * entry, which we can promptly store in the newly freed slot at the end.
3396          * Once we're down to a single-entry heap, we're done.
3397          */
3398         while (state->memtupcount > 1)
3399         {
3400                 SortTuple       stup = state->memtuples[0];
3401
3402                 /* this sifts-up the next-largest entry and decreases memtupcount */
3403                 tuplesort_heap_siftup(state, false);
3404                 state->memtuples[state->memtupcount] = stup;
3405         }
3406         state->memtupcount = tupcount;
3407
3408         /*
3409          * Reverse sort direction back to the original state.  This is not
3410          * actually necessary but seems like a good idea for tidiness.
3411          */
3412         reversedirection(state);
3413
3414         state->status = TSS_SORTEDINMEM;
3415         state->boundUsed = true;
3416 }
3417
3418 /*
3419  * Insert a new tuple into an empty or existing heap, maintaining the
3420  * heap invariant.  Caller is responsible for ensuring there's room.
3421  *
3422  * Note: we assume *tuple is a temporary variable that can be scribbled on.
3423  * For some callers, tuple actually points to a memtuples[] entry above the
3424  * end of the heap.  This is safe as long as it's not immediately adjacent
3425  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3426  * is, it might get overwritten before being moved into the heap!
3427  */
3428 static void
3429 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
3430                                           int tupleindex, bool checkIndex)
3431 {
3432         SortTuple  *memtuples;
3433         int                     j;
3434
3435         /*
3436          * Save the tupleindex --- see notes above about writing on *tuple. It's a
3437          * historical artifact that tupleindex is passed as a separate argument
3438          * and not in *tuple, but it's notationally convenient so let's leave it
3439          * that way.
3440          */
3441         tuple->tupindex = tupleindex;
3442
3443         memtuples = state->memtuples;
3444         Assert(state->memtupcount < state->memtupsize);
3445
3446         CHECK_FOR_INTERRUPTS();
3447
3448         /*
3449          * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3450          * using 1-based array indexes, not 0-based.
3451          */
3452         j = state->memtupcount++;
3453         while (j > 0)
3454         {
3455                 int                     i = (j - 1) >> 1;
3456
3457                 if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0)
3458                         break;
3459                 memtuples[j] = memtuples[i];
3460                 j = i;
3461         }
3462         memtuples[j] = *tuple;
3463 }
3464
3465 /*
3466  * The tuple at state->memtuples[0] has been removed from the heap.
3467  * Decrement memtupcount, and sift up to maintain the heap invariant.
3468  */
3469 static void
3470 tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
3471 {
3472         SortTuple  *memtuples = state->memtuples;
3473         SortTuple  *tuple;
3474         int                     i,
3475                                 n;
3476
3477         if (--state->memtupcount <= 0)
3478                 return;
3479
3480         CHECK_FOR_INTERRUPTS();
3481
3482         n = state->memtupcount;
3483         tuple = &memtuples[n];          /* tuple that must be reinserted */
3484         i = 0;                                          /* i is where the "hole" is */
3485         for (;;)
3486         {
3487                 int                     j = 2 * i + 1;
3488
3489                 if (j >= n)
3490                         break;
3491                 if (j + 1 < n &&
3492                         HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0)
3493                         j++;
3494                 if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0)
3495                         break;
3496                 memtuples[i] = memtuples[j];
3497                 i = j;
3498         }
3499         memtuples[i] = *tuple;
3500 }
3501
3502 /*
3503  * Function to reverse the sort direction from its current state
3504  *
3505  * It is not safe to call this when performing hash tuplesorts
3506  */
3507 static void
3508 reversedirection(Tuplesortstate *state)
3509 {
3510         SortSupport sortKey = state->sortKeys;
3511         int                     nkey;
3512
3513         for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3514         {
3515                 sortKey->ssup_reverse = !sortKey->ssup_reverse;
3516                 sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3517         }
3518 }
3519
3520
3521 /*
3522  * Tape interface routines
3523  */
3524
3525 static unsigned int
3526 getlen(Tuplesortstate *state, int tapenum, bool eofOK)
3527 {
3528         unsigned int len;
3529
3530         if (LogicalTapeRead(state->tapeset, tapenum,
3531                                                 &len, sizeof(len)) != sizeof(len))
3532                 elog(ERROR, "unexpected end of tape");
3533         if (len == 0 && !eofOK)
3534                 elog(ERROR, "unexpected end of data");
3535         return len;
3536 }
3537
3538 static void
3539 markrunend(Tuplesortstate *state, int tapenum)
3540 {
3541         unsigned int len = 0;
3542
3543         LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
3544 }
3545
3546 /*
3547  * Get memory for tuple from within READTUP() routine.  Allocate
3548  * memory and account for that, or consume from tape's batch
3549  * allocation.
3550  *
3551  * Memory returned here in the final on-the-fly merge case is recycled
3552  * from tape's batch allocation.  Otherwise, callers must pfree() or
3553  * reset tuple child memory context, and account for that with a
3554  * FREEMEM().  Currently, this only ever needs to happen in WRITETUP()
3555  * routines.
3556  */
3557 static void *
3558 readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen)
3559 {
3560         if (state->batchUsed)
3561         {
3562                 /*
3563                  * No USEMEM() call, because during final on-the-fly merge
3564                  * accounting is based on tape-private state. ("Overflow"
3565                  * allocations are detected as an indication that a new round
3566                  * or preloading is required. Preloading marks existing
3567                  * contents of tape's batch buffer for reuse.)
3568                  */
3569                 return mergebatchalloc(state, tapenum, tuplen);
3570         }
3571         else
3572         {
3573                 char       *ret;
3574
3575                 /* Batch allocation yet to be performed */
3576                 ret = MemoryContextAlloc(state->tuplecontext, tuplen);
3577                 USEMEM(state, GetMemoryChunkSpace(ret));
3578                 return ret;
3579         }
3580 }
3581
3582
3583 /*
3584  * Routines specialized for HeapTuple (actually MinimalTuple) case
3585  */
3586
3587 static int
3588 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3589 {
3590         SortSupport sortKey = state->sortKeys;
3591         HeapTupleData ltup;
3592         HeapTupleData rtup;
3593         TupleDesc       tupDesc;
3594         int                     nkey;
3595         int32           compare;
3596         AttrNumber      attno;
3597         Datum           datum1,
3598                                 datum2;
3599         bool            isnull1,
3600                                 isnull2;
3601
3602
3603         /* Compare the leading sort key */
3604         compare = ApplySortComparator(a->datum1, a->isnull1,
3605                                                                   b->datum1, b->isnull1,
3606                                                                   sortKey);
3607         if (compare != 0)
3608                 return compare;
3609
3610         /* Compare additional sort keys */
3611         ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3612         ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3613         rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3614         rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3615         tupDesc = state->tupDesc;
3616
3617         if (sortKey->abbrev_converter)
3618         {
3619                 attno = sortKey->ssup_attno;
3620
3621                 datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3622                 datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3623
3624                 compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3625                                                                                                 datum2, isnull2,
3626                                                                                                 sortKey);
3627                 if (compare != 0)
3628                         return compare;
3629         }
3630
3631         sortKey++;
3632         for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3633         {
3634                 attno = sortKey->ssup_attno;
3635
3636                 datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3637                 datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3638
3639                 compare = ApplySortComparator(datum1, isnull1,
3640                                                                           datum2, isnull2,
3641                                                                           sortKey);
3642                 if (compare != 0)
3643                         return compare;
3644         }
3645
3646         return 0;
3647 }
3648
3649 static void
3650 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3651 {
3652         /*
3653          * We expect the passed "tup" to be a TupleTableSlot, and form a
3654          * MinimalTuple using the exported interface for that.
3655          */
3656         TupleTableSlot *slot = (TupleTableSlot *) tup;
3657         Datum           original;
3658         MinimalTuple tuple;
3659         HeapTupleData htup;
3660         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3661
3662         /* copy the tuple into sort storage */
3663         tuple = ExecCopySlotMinimalTuple(slot);
3664         stup->tuple = (void *) tuple;
3665         USEMEM(state, GetMemoryChunkSpace(tuple));
3666         /* set up first-column key value */
3667         htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3668         htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3669         original = heap_getattr(&htup,
3670                                                         state->sortKeys[0].ssup_attno,
3671                                                         state->tupDesc,
3672                                                         &stup->isnull1);
3673
3674         MemoryContextSwitchTo(oldcontext);
3675
3676         if (!state->sortKeys->abbrev_converter || stup->isnull1)
3677         {
3678                 /*
3679                  * Store ordinary Datum representation, or NULL value.  If there is a
3680                  * converter it won't expect NULL values, and cost model is not
3681                  * required to account for NULL, so in that case we avoid calling
3682                  * converter and just set datum1 to zeroed representation (to be
3683                  * consistent, and to support cheap inequality tests for NULL
3684                  * abbreviated keys).
3685                  */
3686                 stup->datum1 = original;
3687         }
3688         else if (!consider_abort_common(state))
3689         {
3690                 /* Store abbreviated key representation */
3691                 stup->datum1 = state->sortKeys->abbrev_converter(original,
3692                                                                                                                  state->sortKeys);
3693         }
3694         else
3695         {
3696                 /* Abort abbreviation */
3697                 int                     i;
3698
3699                 stup->datum1 = original;
3700
3701                 /*
3702                  * Set state to be consistent with never trying abbreviation.
3703                  *
3704                  * Alter datum1 representation in already-copied tuples, so as to
3705                  * ensure a consistent representation (current tuple was just
3706                  * handled).  It does not matter if some dumped tuples are already
3707                  * sorted on tape, since serialized tuples lack abbreviated keys
3708                  * (TSS_BUILDRUNS state prevents control reaching here in any
3709                  * case).
3710                  */
3711                 for (i = 0; i < state->memtupcount; i++)
3712                 {
3713                         SortTuple  *mtup = &state->memtuples[i];
3714
3715                         htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
3716                                 MINIMAL_TUPLE_OFFSET;
3717                         htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
3718                                                                                          MINIMAL_TUPLE_OFFSET);
3719
3720                         mtup->datum1 = heap_getattr(&htup,
3721                                                                                 state->sortKeys[0].ssup_attno,
3722                                                                                 state->tupDesc,
3723                                                                                 &mtup->isnull1);
3724                 }
3725         }
3726 }
3727
3728 static void
3729 writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
3730 {
3731         MinimalTuple tuple = (MinimalTuple) stup->tuple;
3732
3733         /* the part of the MinimalTuple we'll write: */
3734         char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3735         unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
3736
3737         /* total on-disk footprint: */
3738         unsigned int tuplen = tupbodylen + sizeof(int);
3739
3740         LogicalTapeWrite(state->tapeset, tapenum,
3741                                          (void *) &tuplen, sizeof(tuplen));
3742         LogicalTapeWrite(state->tapeset, tapenum,
3743                                          (void *) tupbody, tupbodylen);
3744         if (state->randomAccess)        /* need trailing length word? */
3745                 LogicalTapeWrite(state->tapeset, tapenum,
3746                                                  (void *) &tuplen, sizeof(tuplen));
3747
3748         FREEMEM(state, GetMemoryChunkSpace(tuple));
3749         heap_free_minimal_tuple(tuple);
3750 }
3751
3752 static void
3753 readtup_heap(Tuplesortstate *state, SortTuple *stup,
3754                          int tapenum, unsigned int len)
3755 {
3756         unsigned int tupbodylen = len - sizeof(int);
3757         unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
3758         MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tapenum, tuplen);
3759         char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3760         HeapTupleData htup;
3761
3762         /* read in the tuple proper */
3763         tuple->t_len = tuplen;
3764         LogicalTapeReadExact(state->tapeset, tapenum,
3765                                                  tupbody, tupbodylen);
3766         if (state->randomAccess)        /* need trailing length word? */
3767                 LogicalTapeReadExact(state->tapeset, tapenum,
3768                                                          &tuplen, sizeof(tuplen));
3769         stup->tuple = (void *) tuple;
3770         /* set up first-column key value */
3771         htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3772         htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3773         stup->datum1 = heap_getattr(&htup,
3774                                                                 state->sortKeys[0].ssup_attno,
3775                                                                 state->tupDesc,
3776                                                                 &stup->isnull1);
3777 }
3778
3779 /*
3780  * Routines specialized for the CLUSTER case (HeapTuple data, with
3781  * comparisons per a btree index definition)
3782  */
3783
3784 static int
3785 comparetup_cluster(const SortTuple *a, const SortTuple *b,
3786                                    Tuplesortstate *state)
3787 {
3788         SortSupport sortKey = state->sortKeys;
3789         HeapTuple       ltup;
3790         HeapTuple       rtup;
3791         TupleDesc       tupDesc;
3792         int                     nkey;
3793         int32           compare;
3794         Datum           datum1,
3795                                 datum2;
3796         bool            isnull1,
3797                                 isnull2;
3798         AttrNumber      leading = state->indexInfo->ii_KeyAttrNumbers[0];
3799
3800         /* Be prepared to compare additional sort keys */
3801         ltup = (HeapTuple) a->tuple;
3802         rtup = (HeapTuple) b->tuple;
3803         tupDesc = state->tupDesc;
3804
3805         /* Compare the leading sort key, if it's simple */
3806         if (leading != 0)
3807         {
3808                 compare = ApplySortComparator(a->datum1, a->isnull1,
3809                                                                           b->datum1, b->isnull1,
3810                                                                           sortKey);
3811                 if (compare != 0)
3812                         return compare;
3813
3814                 if (sortKey->abbrev_converter)
3815                 {
3816                         datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
3817                         datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
3818
3819                         compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3820                                                                                                         datum2, isnull2,
3821                                                                                                         sortKey);
3822                 }
3823                 if (compare != 0 || state->nKeys == 1)
3824                         return compare;
3825                 /* Compare additional columns the hard way */
3826                 sortKey++;
3827                 nkey = 1;
3828         }
3829         else
3830         {
3831                 /* Must compare all keys the hard way */
3832                 nkey = 0;
3833         }
3834
3835         if (state->indexInfo->ii_Expressions == NULL)
3836         {
3837                 /* If not expression index, just compare the proper heap attrs */
3838
3839                 for (; nkey < state->nKeys; nkey++, sortKey++)
3840                 {
3841                         AttrNumber      attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
3842
3843                         datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
3844                         datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
3845
3846                         compare = ApplySortComparator(datum1, isnull1,
3847                                                                                   datum2, isnull2,
3848                                                                                   sortKey);
3849                         if (compare != 0)
3850                                 return compare;
3851                 }
3852         }
3853         else
3854         {
3855                 /*
3856                  * In the expression index case, compute the whole index tuple and
3857                  * then compare values.  It would perhaps be faster to compute only as
3858                  * many columns as we need to compare, but that would require
3859                  * duplicating all the logic in FormIndexDatum.
3860                  */
3861                 Datum           l_index_values[INDEX_MAX_KEYS];
3862                 bool            l_index_isnull[INDEX_MAX_KEYS];
3863                 Datum           r_index_values[INDEX_MAX_KEYS];
3864                 bool            r_index_isnull[INDEX_MAX_KEYS];
3865                 TupleTableSlot *ecxt_scantuple;
3866
3867                 /* Reset context each time to prevent memory leakage */
3868                 ResetPerTupleExprContext(state->estate);
3869
3870                 ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
3871
3872                 ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
3873                 FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3874                                            l_index_values, l_index_isnull);
3875
3876                 ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
3877                 FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3878                                            r_index_values, r_index_isnull);
3879
3880                 for (; nkey < state->nKeys; nkey++, sortKey++)
3881                 {
3882                         compare = ApplySortComparator(l_index_values[nkey],
3883                                                                                   l_index_isnull[nkey],
3884                                                                                   r_index_values[nkey],
3885                                                                                   r_index_isnull[nkey],
3886                                                                                   sortKey);
3887                         if (compare != 0)
3888                                 return compare;
3889                 }
3890         }
3891
3892         return 0;
3893 }
3894
3895 static void
3896 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
3897 {
3898         HeapTuple       tuple = (HeapTuple) tup;
3899         Datum           original;
3900         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3901
3902         /* copy the tuple into sort storage */
3903         tuple = heap_copytuple(tuple);
3904         stup->tuple = (void *) tuple;
3905         USEMEM(state, GetMemoryChunkSpace(tuple));
3906
3907         MemoryContextSwitchTo(oldcontext);
3908
3909         /*
3910          * set up first-column key value, and potentially abbreviate, if it's a
3911          * simple column
3912          */
3913         if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
3914                 return;
3915
3916         original = heap_getattr(tuple,
3917                                                         state->indexInfo->ii_KeyAttrNumbers[0],
3918                                                         state->tupDesc,
3919                                                         &stup->isnull1);
3920
3921         if (!state->sortKeys->abbrev_converter || stup->isnull1)
3922         {
3923                 /*
3924                  * Store ordinary Datum representation, or NULL value.  If there is a
3925                  * converter it won't expect NULL values, and cost model is not
3926                  * required to account for NULL, so in that case we avoid calling
3927                  * converter and just set datum1 to zeroed representation (to be
3928                  * consistent, and to support cheap inequality tests for NULL
3929                  * abbreviated keys).
3930                  */
3931                 stup->datum1 = original;
3932         }
3933         else if (!consider_abort_common(state))
3934         {
3935                 /* Store abbreviated key representation */
3936                 stup->datum1 = state->sortKeys->abbrev_converter(original,
3937                                                                                                                  state->sortKeys);
3938         }
3939         else
3940         {
3941                 /* Abort abbreviation */
3942                 int                     i;
3943
3944                 stup->datum1 = original;
3945
3946                 /*
3947                  * Set state to be consistent with never trying abbreviation.
3948                  *
3949                  * Alter datum1 representation in already-copied tuples, so as to
3950                  * ensure a consistent representation (current tuple was just
3951                  * handled).  It does not matter if some dumped tuples are already
3952                  * sorted on tape, since serialized tuples lack abbreviated keys
3953                  * (TSS_BUILDRUNS state prevents control reaching here in any
3954                  * case).
3955                  */
3956                 for (i = 0; i < state->memtupcount; i++)
3957                 {
3958                         SortTuple  *mtup = &state->memtuples[i];
3959
3960                         tuple = (HeapTuple) mtup->tuple;
3961                         mtup->datum1 = heap_getattr(tuple,
3962                                                                           state->indexInfo->ii_KeyAttrNumbers[0],
3963                                                                                 state->tupDesc,
3964                                                                                 &stup->isnull1);
3965                 }
3966         }
3967 }
3968
3969 static void
3970 writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
3971 {
3972         HeapTuple       tuple = (HeapTuple) stup->tuple;
3973         unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
3974
3975         /* We need to store t_self, but not other fields of HeapTupleData */
3976         LogicalTapeWrite(state->tapeset, tapenum,
3977                                          &tuplen, sizeof(tuplen));
3978         LogicalTapeWrite(state->tapeset, tapenum,
3979                                          &tuple->t_self, sizeof(ItemPointerData));
3980         LogicalTapeWrite(state->tapeset, tapenum,
3981                                          tuple->t_data, tuple->t_len);
3982         if (state->randomAccess)        /* need trailing length word? */
3983                 LogicalTapeWrite(state->tapeset, tapenum,
3984                                                  &tuplen, sizeof(tuplen));
3985
3986         FREEMEM(state, GetMemoryChunkSpace(tuple));
3987         heap_freetuple(tuple);
3988 }
3989
3990 static void
3991 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
3992                                 int tapenum, unsigned int tuplen)
3993 {
3994         unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
3995         HeapTuple       tuple = (HeapTuple) readtup_alloc(state,
3996                                                                                                   tapenum,
3997                                                                                                   t_len + HEAPTUPLESIZE);
3998
3999         /* Reconstruct the HeapTupleData header */
4000         tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4001         tuple->t_len = t_len;
4002         LogicalTapeReadExact(state->tapeset, tapenum,
4003                                                  &tuple->t_self, sizeof(ItemPointerData));
4004         /* We don't currently bother to reconstruct t_tableOid */
4005         tuple->t_tableOid = InvalidOid;
4006         /* Read in the tuple body */
4007         LogicalTapeReadExact(state->tapeset, tapenum,
4008                                                  tuple->t_data, tuple->t_len);
4009         if (state->randomAccess)        /* need trailing length word? */
4010                 LogicalTapeReadExact(state->tapeset, tapenum,
4011                                                          &tuplen, sizeof(tuplen));
4012         stup->tuple = (void *) tuple;
4013         /* set up first-column key value, if it's a simple column */
4014         if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
4015                 stup->datum1 = heap_getattr(tuple,
4016                                                                         state->indexInfo->ii_KeyAttrNumbers[0],
4017                                                                         state->tupDesc,
4018                                                                         &stup->isnull1);
4019 }
4020
4021
4022 /*
4023  * Routines specialized for IndexTuple case
4024  *
4025  * The btree and hash cases require separate comparison functions, but the
4026  * IndexTuple representation is the same so the copy/write/read support
4027  * functions can be shared.
4028  */
4029
4030 static int
4031 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
4032                                            Tuplesortstate *state)
4033 {
4034         /*
4035          * This is similar to comparetup_heap(), but expects index tuples.  There
4036          * is also special handling for enforcing uniqueness, and special
4037          * treatment for equal keys at the end.
4038          */
4039         SortSupport sortKey = state->sortKeys;
4040         IndexTuple      tuple1;
4041         IndexTuple      tuple2;
4042         int                     keysz;
4043         TupleDesc       tupDes;
4044         bool            equal_hasnull = false;
4045         int                     nkey;
4046         int32           compare;
4047         Datum           datum1,
4048                                 datum2;
4049         bool            isnull1,
4050                                 isnull2;
4051
4052
4053         /* Compare the leading sort key */
4054         compare = ApplySortComparator(a->datum1, a->isnull1,
4055                                                                   b->datum1, b->isnull1,
4056                                                                   sortKey);
4057         if (compare != 0)
4058                 return compare;
4059
4060         /* Compare additional sort keys */
4061         tuple1 = (IndexTuple) a->tuple;
4062         tuple2 = (IndexTuple) b->tuple;
4063         keysz = state->nKeys;
4064         tupDes = RelationGetDescr(state->indexRel);
4065
4066         if (sortKey->abbrev_converter)
4067         {
4068                 datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
4069                 datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
4070
4071                 compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4072                                                                                                 datum2, isnull2,
4073                                                                                                 sortKey);
4074                 if (compare != 0)
4075                         return compare;
4076         }
4077
4078         /* they are equal, so we only need to examine one null flag */
4079         if (a->isnull1)
4080                 equal_hasnull = true;
4081
4082         sortKey++;
4083         for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
4084         {
4085                 datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
4086                 datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
4087
4088                 compare = ApplySortComparator(datum1, isnull1,
4089                                                                           datum2, isnull2,
4090                                                                           sortKey);
4091                 if (compare != 0)
4092                         return compare;         /* done when we find unequal attributes */
4093
4094                 /* they are equal, so we only need to examine one null flag */
4095                 if (isnull1)
4096                         equal_hasnull = true;
4097         }
4098
4099         /*
4100          * If btree has asked us to enforce uniqueness, complain if two equal
4101          * tuples are detected (unless there was at least one NULL field).
4102          *
4103          * It is sufficient to make the test here, because if two tuples are equal
4104          * they *must* get compared at some stage of the sort --- otherwise the
4105          * sort algorithm wouldn't have checked whether one must appear before the
4106          * other.
4107          */
4108         if (state->enforceUnique && !equal_hasnull)
4109         {
4110                 Datum           values[INDEX_MAX_KEYS];
4111                 bool            isnull[INDEX_MAX_KEYS];
4112                 char       *key_desc;
4113
4114                 /*
4115                  * Some rather brain-dead implementations of qsort (such as the one in
4116                  * QNX 4) will sometimes call the comparison routine to compare a
4117                  * value to itself, but we always use our own implementation, which
4118                  * does not.
4119                  */
4120                 Assert(tuple1 != tuple2);
4121
4122                 index_deform_tuple(tuple1, tupDes, values, isnull);
4123
4124                 key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
4125
4126                 ereport(ERROR,
4127                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
4128                                  errmsg("could not create unique index \"%s\"",
4129                                                 RelationGetRelationName(state->indexRel)),
4130                                  key_desc ? errdetail("Key %s is duplicated.", key_desc) :
4131                                  errdetail("Duplicate keys exist."),
4132                                  errtableconstraint(state->heapRel,
4133                                                                  RelationGetRelationName(state->indexRel))));
4134         }
4135
4136         /*
4137          * If key values are equal, we sort on ItemPointer.  This does not affect
4138          * validity of the finished index, but it may be useful to have index
4139          * scans in physical order.
4140          */
4141         {
4142                 BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4143                 BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4144
4145                 if (blk1 != blk2)
4146                         return (blk1 < blk2) ? -1 : 1;
4147         }
4148         {
4149                 OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4150                 OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4151
4152                 if (pos1 != pos2)
4153                         return (pos1 < pos2) ? -1 : 1;
4154         }
4155
4156         return 0;
4157 }
4158
4159 static int
4160 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
4161                                           Tuplesortstate *state)
4162 {
4163         uint32          hash1;
4164         uint32          hash2;
4165         IndexTuple      tuple1;
4166         IndexTuple      tuple2;
4167
4168         /*
4169          * Fetch hash keys and mask off bits we don't want to sort by. We know
4170          * that the first column of the index tuple is the hash key.
4171          */
4172         Assert(!a->isnull1);
4173         hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
4174         Assert(!b->isnull1);
4175         hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
4176
4177         if (hash1 > hash2)
4178                 return 1;
4179         else if (hash1 < hash2)
4180                 return -1;
4181
4182         /*
4183          * If hash values are equal, we sort on ItemPointer.  This does not affect
4184          * validity of the finished index, but it may be useful to have index
4185          * scans in physical order.
4186          */
4187         tuple1 = (IndexTuple) a->tuple;
4188         tuple2 = (IndexTuple) b->tuple;
4189
4190         {
4191                 BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4192                 BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4193
4194                 if (blk1 != blk2)
4195                         return (blk1 < blk2) ? -1 : 1;
4196         }
4197         {
4198                 OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4199                 OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4200
4201                 if (pos1 != pos2)
4202                         return (pos1 < pos2) ? -1 : 1;
4203         }
4204
4205         return 0;
4206 }
4207
4208 static void
4209 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
4210 {
4211         IndexTuple      tuple = (IndexTuple) tup;
4212         unsigned int tuplen = IndexTupleSize(tuple);
4213         IndexTuple      newtuple;
4214         Datum           original;
4215
4216         /* copy the tuple into sort storage */
4217         newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen);
4218         memcpy(newtuple, tuple, tuplen);
4219         USEMEM(state, GetMemoryChunkSpace(newtuple));
4220         stup->tuple = (void *) newtuple;
4221         /* set up first-column key value */
4222         original = index_getattr(newtuple,
4223                                                          1,
4224                                                          RelationGetDescr(state->indexRel),
4225                                                          &stup->isnull1);
4226
4227         if (!state->sortKeys->abbrev_converter || stup->isnull1)
4228         {
4229                 /*
4230                  * Store ordinary Datum representation, or NULL value.  If there is a
4231                  * converter it won't expect NULL values, and cost model is not
4232                  * required to account for NULL, so in that case we avoid calling
4233                  * converter and just set datum1 to zeroed representation (to be
4234                  * consistent, and to support cheap inequality tests for NULL
4235                  * abbreviated keys).
4236                  */
4237                 stup->datum1 = original;
4238         }
4239         else if (!consider_abort_common(state))
4240         {
4241                 /* Store abbreviated key representation */
4242                 stup->datum1 = state->sortKeys->abbrev_converter(original,
4243                                                                                                                  state->sortKeys);
4244         }
4245         else
4246         {
4247                 /* Abort abbreviation */
4248                 int                     i;
4249
4250                 stup->datum1 = original;
4251
4252                 /*
4253                  * Set state to be consistent with never trying abbreviation.
4254                  *
4255                  * Alter datum1 representation in already-copied tuples, so as to
4256                  * ensure a consistent representation (current tuple was just
4257                  * handled).  It does not matter if some dumped tuples are already
4258                  * sorted on tape, since serialized tuples lack abbreviated keys
4259                  * (TSS_BUILDRUNS state prevents control reaching here in any
4260                  * case).
4261                  */
4262                 for (i = 0; i < state->memtupcount; i++)
4263                 {
4264                         SortTuple  *mtup = &state->memtuples[i];
4265
4266                         tuple = (IndexTuple) mtup->tuple;
4267                         mtup->datum1 = index_getattr(tuple,
4268                                                                                  1,
4269                                                                                  RelationGetDescr(state->indexRel),
4270                                                                                  &stup->isnull1);
4271                 }
4272         }
4273 }
4274
4275 static void
4276 writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
4277 {
4278         IndexTuple      tuple = (IndexTuple) stup->tuple;
4279         unsigned int tuplen;
4280
4281         tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4282         LogicalTapeWrite(state->tapeset, tapenum,
4283                                          (void *) &tuplen, sizeof(tuplen));
4284         LogicalTapeWrite(state->tapeset, tapenum,
4285                                          (void *) tuple, IndexTupleSize(tuple));
4286         if (state->randomAccess)        /* need trailing length word? */
4287                 LogicalTapeWrite(state->tapeset, tapenum,
4288                                                  (void *) &tuplen, sizeof(tuplen));
4289
4290         FREEMEM(state, GetMemoryChunkSpace(tuple));
4291         pfree(tuple);
4292 }
4293
4294 static void
4295 readtup_index(Tuplesortstate *state, SortTuple *stup,
4296                           int tapenum, unsigned int len)
4297 {
4298         unsigned int tuplen = len - sizeof(unsigned int);
4299         IndexTuple      tuple = (IndexTuple) readtup_alloc(state, tapenum, tuplen);
4300
4301         LogicalTapeReadExact(state->tapeset, tapenum,
4302                                                  tuple, tuplen);
4303         if (state->randomAccess)        /* need trailing length word? */
4304                 LogicalTapeReadExact(state->tapeset, tapenum,
4305                                                          &tuplen, sizeof(tuplen));
4306         stup->tuple = (void *) tuple;
4307         /* set up first-column key value */
4308         stup->datum1 = index_getattr(tuple,
4309                                                                  1,
4310                                                                  RelationGetDescr(state->indexRel),
4311                                                                  &stup->isnull1);
4312 }
4313
4314 /*
4315  * Routines specialized for DatumTuple case
4316  */
4317
4318 static int
4319 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4320 {
4321         int                     compare;
4322
4323         compare = ApplySortComparator(a->datum1, a->isnull1,
4324                                                                   b->datum1, b->isnull1,
4325                                                                   state->sortKeys);
4326         if (compare != 0)
4327                 return compare;
4328
4329         /* if we have abbreviations, then "tuple" has the original value */
4330
4331         if (state->sortKeys->abbrev_converter)
4332                 compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4333                                                                            PointerGetDatum(b->tuple), b->isnull1,
4334                                                                                                 state->sortKeys);
4335
4336         return compare;
4337 }
4338
4339 static void
4340 copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4341 {
4342         /* Not currently needed */
4343         elog(ERROR, "copytup_datum() should not be called");
4344 }
4345
4346 static void
4347 writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
4348 {
4349         void       *waddr;
4350         unsigned int tuplen;
4351         unsigned int writtenlen;
4352
4353         if (stup->isnull1)
4354         {
4355                 waddr = NULL;
4356                 tuplen = 0;
4357         }
4358         else if (!state->tuples)
4359         {
4360                 waddr = &stup->datum1;
4361                 tuplen = sizeof(Datum);
4362         }
4363         else
4364         {
4365                 waddr = stup->tuple;
4366                 tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4367                 Assert(tuplen != 0);
4368         }
4369
4370         writtenlen = tuplen + sizeof(unsigned int);
4371
4372         LogicalTapeWrite(state->tapeset, tapenum,
4373                                          (void *) &writtenlen, sizeof(writtenlen));
4374         LogicalTapeWrite(state->tapeset, tapenum,
4375                                          waddr, tuplen);
4376         if (state->randomAccess)        /* need trailing length word? */
4377                 LogicalTapeWrite(state->tapeset, tapenum,
4378                                                  (void *) &writtenlen, sizeof(writtenlen));
4379
4380         if (stup->tuple)
4381         {
4382                 FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4383                 pfree(stup->tuple);
4384         }
4385 }
4386
4387 static void
4388 readtup_datum(Tuplesortstate *state, SortTuple *stup,
4389                           int tapenum, unsigned int len)
4390 {
4391         unsigned int tuplen = len - sizeof(unsigned int);
4392
4393         if (tuplen == 0)
4394         {
4395                 /* it's NULL */
4396                 stup->datum1 = (Datum) 0;
4397                 stup->isnull1 = true;
4398                 stup->tuple = NULL;
4399         }
4400         else if (!state->tuples)
4401         {
4402                 Assert(tuplen == sizeof(Datum));
4403                 LogicalTapeReadExact(state->tapeset, tapenum,
4404                                                          &stup->datum1, tuplen);
4405                 stup->isnull1 = false;
4406                 stup->tuple = NULL;
4407         }
4408         else
4409         {
4410                 void       *raddr = readtup_alloc(state, tapenum, tuplen);
4411
4412                 LogicalTapeReadExact(state->tapeset, tapenum,
4413                                                          raddr, tuplen);
4414                 stup->datum1 = PointerGetDatum(raddr);
4415                 stup->isnull1 = false;
4416                 stup->tuple = raddr;
4417         }
4418
4419         if (state->randomAccess)        /* need trailing length word? */
4420                 LogicalTapeReadExact(state->tapeset, tapenum,
4421                                                          &tuplen, sizeof(tuplen));
4422 }
4423
4424 /*
4425  * Convenience routine to free a tuple previously loaded into sort memory
4426  */
4427 static void
4428 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4429 {
4430         FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4431         pfree(stup->tuple);
4432 }