1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  *        Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary).  It works efficiently for both small and large amounts
9  * of data.  Small amounts are sorted in-memory using qsort().  Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about the external
14  * sorting algorithm.  Historically, we divided the input into sorted runs
15  * using replacement selection, in the form of a priority tree implemented
16  * as a heap (essentially his Algorithm 5.2.3H), but now we always use
17  * quicksort for run generation.  We merge the runs using polyphase merge,
18  * Knuth's Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D are
19  * implemented by logtape.c, which avoids space wastage by recycling disk
20  * space as soon as each block is read from its "tape".
21  *
22  * The approximate amount of memory allowed for any one sort operation
23  * is specified in kilobytes by the caller (most pass work_mem).  Initially,
24  * we absorb tuples and simply store them in an unsorted array as long as
25  * we haven't exceeded workMem.  If we reach the end of the input without
26  * exceeding workMem, we sort the array using qsort() and subsequently return
27  * tuples just by scanning the tuple array sequentially.  If we do exceed
28  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
29  * When tuples are dumped in batch after quicksorting, we begin a new run
30  * with a new output tape (selected per Algorithm D).  After the end of the
31  * input is reached, we dump out remaining tuples in memory into a final run,
32  * then merge the runs using Algorithm D.
33  *
34  * When merging runs, we use a heap containing just the frontmost tuple from
35  * each source run; we repeatedly output the smallest tuple and replace it
36  * with the next tuple from its source tape (if any).  When the heap empties,
37  * the merge is complete.  The basic merge algorithm thus needs very little
38  * memory --- only M tuples for an M-way merge, and M is constrained to a
39  * small number.  However, we can still make good use of our full workMem
40  * allocation by pre-reading additional blocks from each source tape.  Without
41  * prereading, our access pattern to the temporary file would be very erratic;
42  * on average we'd read one block from each of M source tapes during the same
43  * time that we're writing M blocks to the output tape, so there is no
44  * sequentiality of access at all, defeating the read-ahead methods used by
45  * most Unix kernels.  Worse, the output tape gets written into a very random
46  * sequence of blocks of the temp file, ensuring that things will be even
47  * worse when it comes time to read that tape.  A straightforward merge pass
48  * thus ends up doing a lot of waiting for disk seeks.  We can improve matters
49  * by prereading from each source tape sequentially, loading about workMem/M
50  * bytes from each tape in turn, and making the sequential blocks immediately
51  * available for reuse.  This approach helps to localize both read and write
52  * accesses.  The pre-reading is handled by logtape.c; we just tell it how
53  * much memory to use for the buffers.
54  *
55  * When the caller requests random access to the sort result, we form
56  * the final sorted run on a logical tape which is then "frozen", so
57  * that we can access it randomly.  When the caller does not need random
58  * access, we return from tuplesort_performsort() as soon as we are down
59  * to one run per logical tape.  The final merge is then performed
60  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
61  * saves one cycle of writing all the data out to disk and reading it in.
62  *
63  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
64  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
65  * to Knuth's figure 70 (section 5.4.2).  However, Knuth is assuming that
66  * tape drives are expensive beasts, and in particular that there will always
67  * be many more runs than tape drives.  In our implementation a "tape drive"
68  * doesn't cost much more than a few Kb of memory buffers, so we can afford
69  * to have lots of them.  In particular, if we can have as many tape drives
70  * as sorted runs, we can eliminate any repeated I/O at all.  In the current
71  * code we determine the number of tapes M on the basis of workMem: we want
72  * workMem/M to be large enough that we read a fair amount of data each time
73  * we preread from a tape, so as to maintain the locality of access described
74  * above.  Nonetheless, with large workMem we can have many tapes (but not
75  * too many -- see the comments in tuplesort_merge_order).
76  *
77  *
78  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
79  * Portions Copyright (c) 1994, Regents of the University of California
80  *
81  * IDENTIFICATION
82  *        src/backend/utils/sort/tuplesort.c
83  *
84  *-------------------------------------------------------------------------
85  */
86
87 #include "postgres.h"
88
89 #include <limits.h>
90
91 #include "access/htup_details.h"
92 #include "access/nbtree.h"
93 #include "access/hash.h"
94 #include "catalog/index.h"
95 #include "catalog/pg_am.h"
96 #include "commands/tablespace.h"
97 #include "executor/executor.h"
98 #include "miscadmin.h"
99 #include "pg_trace.h"
100 #include "utils/datum.h"
101 #include "utils/logtape.h"
102 #include "utils/lsyscache.h"
103 #include "utils/memutils.h"
104 #include "utils/pg_rusage.h"
105 #include "utils/rel.h"
106 #include "utils/sortsupport.h"
107 #include "utils/tuplesort.h"
108
109
110 /* sort-type codes for sort__start probes */
111 #define HEAP_SORT               0
112 #define INDEX_SORT              1
113 #define DATUM_SORT              2
114 #define CLUSTER_SORT    3
115
116 /* GUC variables */
117 #ifdef TRACE_SORT
118 bool            trace_sort = false;
119 #endif
120
121 #ifdef DEBUG_BOUNDED_SORT
122 bool            optimize_bounded_sort = true;
123 #endif
124
125
126 /*
127  * The objects we actually sort are SortTuple structs.  These contain
128  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
129  * which is a separate palloc chunk --- we assume it is just one chunk and
130  * can be freed by a simple pfree() (except during merge, when we use a
131  * simple slab allocator).  SortTuples also contain the tuple's first key
132  * column in Datum/nullflag format, and an index integer.
133  *
134  * Storing the first key column lets us save heap_getattr or index_getattr
135  * calls during tuple comparisons.  We could extract and save all the key
136  * columns not just the first, but this would increase code complexity and
137  * overhead, and wouldn't actually save any comparison cycles in the common
138  * case where the first key determines the comparison result.  Note that
139  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
140  *
141  * There is one special case: when the sort support infrastructure provides an
142  * "abbreviated key" representation --- typically a pass-by-value proxy for a
143  * pass-by-reference type --- the abbreviated key is stored in datum1 in place
144  * of the actual first key column.
145  *
146  * When sorting single Datums, the data value is represented directly by
147  * datum1/isnull1 for pass by value types (or null values).  If the datatype is
148  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
149  * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
150  * either the same pointer as "tuple", or is an abbreviated key value as
151  * described above.  Accordingly, "tuple" is always used in preference to
152  * datum1 as the authoritative value for pass-by-reference cases.
153  *
154  * tupindex holds the input tape number that each tuple in the heap was read
155  * from during merge passes.
156  */
157 typedef struct
158 {
159         void       *tuple;                      /* the tuple itself */
160         Datum           datum1;                 /* value of first key column */
161         bool            isnull1;                /* is first key column NULL? */
162         int                     tupindex;               /* see notes above */
163 } SortTuple;
164
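/*
 * Illustrative sketch only: a comparetup routine typically compares the saved
 * first-key values before touching the tuples themselves, and deforms the
 * tuples only when the leading keys compare equal.  Roughly, assuming the
 * leading sort key is state->sortKeys[0]:
 *
 *		compare = ApplySortComparator(a->datum1, a->isnull1,
 *									  b->datum1, b->isnull1,
 *									  &state->sortKeys[0]);
 *		if (compare != 0)
 *			return compare;
 *		... otherwise fetch further attributes (heap_getattr/index_getattr)
 *		... and compare them using the remaining sortKeys entries
 */
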
165 /*
166  * During merge, we use a pre-allocated set of fixed-size slots to hold
167  * tuples, in order to avoid palloc/pfree overhead.
168  *
169  * Merge doesn't require a lot of memory, so we can afford to waste some,
170  * by using gratuitously-sized slots.  Tuples larger than SLAB_SLOT_SIZE (1 kB)
171  * are palloc'd instead, and at that size the palloc() overhead is insignificant.
172  *
173  * 'nextfree' is valid when this chunk is in the free list.  When in use, the
174  * slot holds a tuple.
175  */
176 #define SLAB_SLOT_SIZE 1024
177
178 typedef union SlabSlot
179 {
180         union SlabSlot *nextfree;
181         char            buffer[SLAB_SLOT_SIZE];
182 } SlabSlot;
183
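/*
 * Sketch only, roughly what init_slab_allocator() later in this file does
 * (assuming numSlots > 0): the arena is one palloc chunk carved into
 * SLAB_SLOT_SIZE-byte slots, all threaded onto the free list:
 *
 *		state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
 *		state->slabMemoryEnd = state->slabMemoryBegin +
 *			numSlots * SLAB_SLOT_SIZE;
 *		state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
 *		... link each slot's nextfree to the following slot, and set the
 *		... last slot's nextfree to NULL
 */
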
184 /*
185  * Possible states of a Tuplesort object.  These denote the states that
186  * persist between calls of Tuplesort routines.
187  */
188 typedef enum
189 {
190         TSS_INITIAL,                            /* Loading tuples; still within memory limit */
191         TSS_BOUNDED,                            /* Loading tuples into bounded-size heap */
192         TSS_BUILDRUNS,                          /* Loading tuples; writing to tape */
193         TSS_SORTEDINMEM,                        /* Sort completed entirely in memory */
194         TSS_SORTEDONTAPE,                       /* Sort completed, final run is on tape */
195         TSS_FINALMERGE                          /* Performing final merge on-the-fly */
196 } TupSortStatus;
197
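/*
 * Rough sketch of the usual transitions between these states (ignoring the
 * bounded-sort path through TSS_BOUNDED):
 *
 *		TSS_INITIAL   --(input fits in workMem)-->          TSS_SORTEDINMEM
 *		TSS_INITIAL   --(workMem exceeded)-->                TSS_BUILDRUNS
 *		TSS_BUILDRUNS --(randomAccess, or only one run)-->   TSS_SORTEDONTAPE
 *		TSS_BUILDRUNS --(!randomAccess, several runs)-->     TSS_FINALMERGE
 */
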
198 /*
199  * Parameters for calculation of number of tapes to use --- see inittapes()
200  * and tuplesort_merge_order().
201  *
202  * In this calculation we assume that each tape will cost us about one block's
203  * worth of buffer space.  This ignores the overhead of all the other data
204  * structures needed for each tape, but it's probably close enough.
205  *
206  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
207  * tape during a preread cycle (see discussion at top of file).
208  */
209 #define MINORDER                6               /* minimum merge order */
210 #define MAXORDER                500             /* maximum merge order */
211 #define TAPE_BUFFER_OVERHEAD            BLCKSZ
212 #define MERGE_BUFFER_SIZE                       (BLCKSZ * 32)
213
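/*
 * Sketch only: with these constants, tuplesort_merge_order() (later in this
 * file) derives the merge order roughly as one tape buffer for the output
 * plus (MERGE_BUFFER_SIZE + one tape buffer) per input tape, clamped to the
 * range [MINORDER, MAXORDER]:
 *
 *		mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
 *			(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
 *		mOrder = Max(mOrder, MINORDER);
 *		mOrder = Min(mOrder, MAXORDER);
 */
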
214 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
215                                                                         Tuplesortstate *state);
216
217 /*
218  * Private state of a Tuplesort operation.
219  */
220 struct Tuplesortstate
221 {
222         TupSortStatus status;           /* enumerated value as shown above */
223         int                     nKeys;                  /* number of columns in sort key */
224         bool            randomAccess;   /* did caller request random access? */
225         bool            bounded;                /* did caller specify a maximum number of
226                                                                  * tuples to return? */
227         bool            boundUsed;              /* true if we made use of a bounded heap */
228         int                     bound;                  /* if bounded, the maximum number of tuples */
229         bool            tuples;                 /* Can SortTuple.tuple ever be set? */
230         int64           availMem;               /* remaining memory available, in bytes */
231         int64           allowedMem;             /* total memory allowed, in bytes */
232         int                     maxTapes;               /* number of tapes (Knuth's T) */
233         int                     tapeRange;              /* maxTapes-1 (Knuth's P) */
234         MemoryContext sortcontext;      /* memory context holding most sort data */
235         MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
236         LogicalTapeSet *tapeset;        /* logtape.c object for tapes in a temp file */
237
238         /*
239          * These function pointers decouple the routines that must know what kind
240          * of tuple we are sorting from the routines that don't need to know it.
241          * They are set up by the tuplesort_begin_xxx routines.
242          *
243          * Function to compare two tuples; result is per qsort() convention, ie:
244          * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
245          * qsort_arg_comparator.
246          */
247         SortTupleComparator comparetup;
248
249         /*
250          * Function to copy a supplied input tuple into palloc'd space and set up
251          * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
252          * state->availMem must be decreased by the amount of space used for the
253          * tuple copy (note the SortTuple struct itself is not counted).
254          */
255         void            (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
256
257         /*
258          * Function to write a stored tuple onto tape.  The representation of the
259          * tuple on tape need not be the same as it is in memory; requirements on
260          * the tape representation are given below.  Unless the slab allocator is
261          * used, after writing the tuple, pfree() the out-of-line data (not the
262          * SortTuple struct!), and increase state->availMem by the amount of
263          * memory space thereby released.
264          */
265         void            (*writetup) (Tuplesortstate *state, int tapenum,
266                                                          SortTuple *stup);
267
268         /*
269          * Function to read a stored tuple from tape back into memory. 'len' is
270          * the already-read length of the stored tuple.  The tuple is allocated
271  * from the slab memory arena, or is palloc'd; see readtup_alloc().
272          */
273         void            (*readtup) (Tuplesortstate *state, SortTuple *stup,
274                                                         int tapenum, unsigned int len);
275
276         /*
277          * This array holds the tuples now in sort memory.  If we are in state
278          * INITIAL, the tuples are in no particular order; if we are in state
279          * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
280          * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
281          * H.  In state SORTEDONTAPE, the array is not used.
282          */
283         SortTuple  *memtuples;          /* array of SortTuple structs */
284         int                     memtupcount;    /* number of tuples currently present */
285         int                     memtupsize;             /* allocated length of memtuples array */
286         bool            growmemtuples;  /* memtuples' growth still underway? */
287
288         /*
289          * Memory for tuples is sometimes allocated using a simple slab allocator,
290          * rather than with palloc().  Currently, we switch to slab allocation
291          * when we start merging.  Merging only needs to keep a small, fixed
292          * number of tuples in memory at any time, so we can avoid the
293          * palloc/pfree overhead by recycling a fixed number of fixed-size slots
294          * to hold the tuples.
295          *
296  * For the slab, we use one large allocation, divided into slots of
297  * SLAB_SLOT_SIZE bytes each.  The allocation has one slot per tape, plus one
298          * additional slot.  We need that many slots to hold all the tuples kept
299          * in the heap during merge, plus the one we have last returned from the
300          * sort, with tuplesort_gettuple.
301          *
302          * Initially, all the slots are kept in a linked list of free slots.  When
303  * a tuple is read from a tape, it is put into the next available slot, if
304          * it fits.  If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
305          * instead.
306          *
307          * When we're done processing a tuple, we return the slot back to the free
308          * list, or pfree() if it was palloc'd.  We know that a tuple was
309          * allocated from the slab, if its pointer value is between
310          * slabMemoryBegin and -End.
311          *
312          * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
313          * tracking memory usage is not used.
314          */
315         bool            slabAllocatorUsed;
316
317         char       *slabMemoryBegin;    /* beginning of slab memory arena */
318         char       *slabMemoryEnd;      /* end of slab memory arena */
319         SlabSlot   *slabFreeHead;       /* head of free list */
320
321         /* Buffer size to use for reading input tapes, during merge. */
322         size_t          read_buffer_size;
323
324         /*
325  * When tuplesort_gettuple_XXX returns a tuple to the caller that came
326  * from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE modes),
327  * we remember the tuple in 'lastReturnedTuple', so that we can recycle
328  * its memory on the next gettuple call.
329          */
330         void       *lastReturnedTuple;
331
332         /*
333          * While building initial runs, this is the current output run number.
334          * Afterwards, it is the number of initial runs we made.
335          */
336         int                     currentRun;
337
338         /*
339          * Unless otherwise noted, all pointer variables below are pointers to
340          * arrays of length maxTapes, holding per-tape data.
341          */
342
343         /*
344          * This variable is only used during merge passes.  mergeactive[i] is true
345          * if we are reading an input run from (actual) tape number i and have not
346          * yet exhausted that run.
347          */
348         bool       *mergeactive;        /* active input run source? */
349
350         /*
351          * Variables for Algorithm D.  Note that destTape is a "logical" tape
352          * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
353          * "logical" and "actual" tape numbers straight!
354          */
355         int                     Level;                  /* Knuth's l */
356         int                     destTape;               /* current output tape (Knuth's j, less 1) */
357         int                *tp_fib;                     /* Target Fibonacci run counts (A[]) */
358         int                *tp_runs;            /* # of real runs on each tape */
359         int                *tp_dummy;           /* # of dummy runs for each tape (D[]) */
360         int                *tp_tapenum;         /* Actual tape numbers (TAPE[]) */
361         int                     activeTapes;    /* # of active input tapes in merge pass */
362
363         /*
364          * These variables are used after completion of sorting to keep track of
365          * the next tuple to return.  (In the tape case, the tape's current read
366          * position is also critical state.)
367          */
368         int                     result_tape;    /* actual tape number of finished output */
369         int                     current;                /* array index (only used if SORTEDINMEM) */
370         bool            eof_reached;    /* reached EOF (needed for cursors) */
371
372         /* markpos_xxx holds marked position for mark and restore */
373         long            markpos_block;  /* tape block# (only used if SORTEDONTAPE) */
374         int                     markpos_offset; /* saved "current", or offset in tape block */
375         bool            markpos_eof;    /* saved "eof_reached" */
376
377         /*
378          * The sortKeys variable is used by every case other than the hash index
379          * case; it is set by tuplesort_begin_xxx.  tupDesc is only used by the
380          * MinimalTuple and CLUSTER routines, though.
381          */
382         TupleDesc       tupDesc;
383         SortSupport sortKeys;           /* array of length nKeys */
384
385         /*
386          * This variable is shared by the single-key MinimalTuple case and the
387          * Datum case (which both use qsort_ssup()).  Otherwise it's NULL.
388          */
389         SortSupport onlyKey;
390
391         /*
392          * Additional state for managing "abbreviated key" sortsupport routines
393          * (which currently may be used by all cases except the hash index case).
394          * Tracks the intervals at which the optimization's effectiveness is
395          * tested.
396          */
397         int64           abbrevNext;             /* Tuple # at which to next check
398                                                                  * applicability */
399
400         /*
401          * These variables are specific to the CLUSTER case; they are set by
402          * tuplesort_begin_cluster.
403          */
404         IndexInfo  *indexInfo;          /* info about index being used for reference */
405         EState     *estate;                     /* for evaluating index expressions */
406
407         /*
408          * These variables are specific to the IndexTuple case; they are set by
409          * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
410          */
411         Relation        heapRel;                /* table the index is being built on */
412         Relation        indexRel;               /* index being built */
413
414         /* These are specific to the index_btree subcase: */
415         bool            enforceUnique;  /* complain if we find duplicate tuples */
416
417         /* These are specific to the index_hash subcase: */
418         uint32          high_mask;              /* masks for sortable part of hash code */
419         uint32          low_mask;
420         uint32          max_buckets;
421
422         /*
423          * These variables are specific to the Datum case; they are set by
424          * tuplesort_begin_datum and used only by the DatumTuple routines.
425          */
426         Oid                     datumType;
427         /* we need typelen in order to know how to copy the Datums. */
428         int                     datumTypeLen;
429
430         /*
431          * Resource snapshot for time of sort start.
432          */
433 #ifdef TRACE_SORT
434         PGRUsage        ru_start;
435 #endif
436 };
437
438 /*
439  * Is the given tuple allocated from the slab memory arena?
440  */
441 #define IS_SLAB_SLOT(state, tuple) \
442         ((char *) (tuple) >= (state)->slabMemoryBegin && \
443          (char *) (tuple) < (state)->slabMemoryEnd)
444
445 /*
446  * Return the given tuple to the slab memory free list, or free it
447  * if it was palloc'd.
448  */
449 #define RELEASE_SLAB_SLOT(state, tuple) \
450         do { \
451                 SlabSlot *buf = (SlabSlot *) tuple; \
452                 \
453                 if (IS_SLAB_SLOT((state), buf)) \
454                 { \
455                         buf->nextfree = (state)->slabFreeHead; \
456                         (state)->slabFreeHead = buf; \
457                 } else \
458                         pfree(buf); \
459         } while(0)
460
461 #define COMPARETUP(state,a,b)   ((*(state)->comparetup) (a, b, state))
462 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
463 #define WRITETUP(state,tape,stup)       ((*(state)->writetup) (state, tape, stup))
464 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
465 #define LACKMEM(state)          ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
466 #define USEMEM(state,amt)       ((state)->availMem -= (amt))
467 #define FREEMEM(state,amt)      ((state)->availMem += (amt))
468
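/*
 * Sketch only: copytup and writetup routines are expected to keep availMem
 * in step with the tuple memory they allocate and release, for example:
 *
 *		... in a copytup routine, after copying into state->tuplecontext:
 *		USEMEM(state, GetMemoryChunkSpace(stup->tuple));
 *
 *		... in a writetup routine, once the tuple is safely on tape (and the
 *		... slab allocator is not in use):
 *		FREEMEM(state, GetMemoryChunkSpace(tuple));
 *		pfree(tuple);
 */
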
469 /*
470  * NOTES about on-tape representation of tuples:
471  *
472  * We require the first "unsigned int" of a stored tuple to be the total size
473  * on-tape of the tuple, including itself (so it is never zero; an all-zero
474  * unsigned int is used to delimit runs).  The remainder of the stored tuple
475  * may or may not match the in-memory representation of the tuple ---
476  * any conversion needed is the job of the writetup and readtup routines.
477  *
478  * If state->randomAccess is true, then the stored representation of the
479  * tuple must be followed by another "unsigned int" that is a copy of the
480  * length --- so the total tape space used is actually sizeof(unsigned int)
481  * more than the stored length value.  This allows read-backwards.  When
482  * randomAccess is not true, the write/read routines may omit the extra
483  * length word.
484  *
485  * writetup is expected to write both length words as well as the tuple
486  * data.  When readtup is called, the tape is positioned just after the
487  * front length word; readtup must read the tuple data and advance past
488  * the back length word (if present).
489  *
490  * The write/read routines can make use of the tuple description data
491  * stored in the Tuplesortstate record, if needed.  They are also expected
492  * to adjust state->availMem by the amount of memory space (not tape space!)
493  * released or consumed.  There is no error return from either writetup
494  * or readtup; they should ereport() on failure.
495  *
496  *
497  * NOTES about memory consumption calculations:
498  *
499  * We count space allocated for tuples against the workMem limit, plus
500  * the space used by the variable-size memtuples array.  Fixed-size space
501  * is not counted; it's small enough to not be interesting.
502  *
503  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
504  * rather than the originally-requested size.  This is important since
505  * palloc can add substantial overhead.  It's not a complete answer since
506  * we won't count any wasted space in palloc allocation blocks, but it's
507  * a lot better than what we were doing before 7.3.  As of 9.6, a
508  * separate memory context is used for caller passed tuples.  Resetting
509  * it at certain key increments significantly ameliorates fragmentation.
510  * Note that this places a responsibility on readtup and copytup routines
511  * to use the right memory context for these tuples (and to not use the
512  * reset context for anything whose lifetime needs to span multiple
513  * external sort runs).
514  */
515
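/*
 * Sketch only: following the on-tape format described above, a writetup
 * routine emits the leading length word (which counts itself), the tuple
 * body, and, if randomAccess is true, a trailing copy of the length word:
 *
 *		tuplen = tupbodylen + sizeof(tuplen);
 *		LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
 *		LogicalTapeWrite(state->tapeset, tapenum, tupbody, tupbodylen);
 *		if (state->randomAccess)
 *			LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
 */
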
516 /* When using this macro, beware of double evaluation of len */
517 #define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
518         do { \
519                 if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
520                         elog(ERROR, "unexpected end of data"); \
521         } while(0)
522
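/*
 * Sketch only: a readtup routine, called after getlen() has consumed the
 * leading length word, typically uses the macro along these lines:
 *
 *		tupbodylen = len - sizeof(unsigned int);
 *		... allocate the tuple with readtup_alloc() ...
 *		LogicalTapeReadExact(state->tapeset, tapenum, tupbody, tupbodylen);
 *		if (state->randomAccess)
 *			LogicalTapeReadExact(state->tapeset, tapenum,
 *								 &tuplen, sizeof(tuplen));
 */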
523
524 static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
525 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
526 static bool consider_abort_common(Tuplesortstate *state);
527 static void inittapes(Tuplesortstate *state);
528 static void selectnewtape(Tuplesortstate *state);
529 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
530 static void mergeruns(Tuplesortstate *state);
531 static void mergeonerun(Tuplesortstate *state);
532 static void beginmerge(Tuplesortstate *state);
533 static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
534 static void dumptuples(Tuplesortstate *state, bool alltuples);
535 static void make_bounded_heap(Tuplesortstate *state);
536 static void sort_bounded_heap(Tuplesortstate *state);
537 static void tuplesort_sort_memtuples(Tuplesortstate *state);
538 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
539 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
540 static void tuplesort_heap_delete_top(Tuplesortstate *state);
541 static void reversedirection(Tuplesortstate *state);
542 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
543 static void markrunend(Tuplesortstate *state, int tapenum);
544 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
545 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
546                                 Tuplesortstate *state);
547 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
548 static void writetup_heap(Tuplesortstate *state, int tapenum,
549                           SortTuple *stup);
550 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
551                          int tapenum, unsigned int len);
552 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
553                                    Tuplesortstate *state);
554 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
555 static void writetup_cluster(Tuplesortstate *state, int tapenum,
556                                  SortTuple *stup);
557 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
558                                 int tapenum, unsigned int len);
559 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
560                                            Tuplesortstate *state);
561 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
562                                           Tuplesortstate *state);
563 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
564 static void writetup_index(Tuplesortstate *state, int tapenum,
565                            SortTuple *stup);
566 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
567                           int tapenum, unsigned int len);
568 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
569                                  Tuplesortstate *state);
570 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
571 static void writetup_datum(Tuplesortstate *state, int tapenum,
572                            SortTuple *stup);
573 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
574                           int tapenum, unsigned int len);
575 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
576
577 /*
578  * Special versions of qsort just for SortTuple objects.  qsort_tuple() sorts
579  * any variant of SortTuples, using the appropriate comparetup function.
580  * qsort_ssup() is specialized for the case where the comparetup function
581  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
582  * and Datum sorts.
583  */
584 #include "qsort_tuple.c"
585
586
587 /*
588  *              tuplesort_begin_xxx
589  *
590  * Initialize for a tuple sort operation.
591  *
592  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
593  * zero or more times, then call tuplesort_performsort when all the tuples
594  * have been supplied.  After performsort, retrieve the tuples in sorted
595  * order by calling tuplesort_getXXX until it returns false/NULL.  (If random
596  * access was requested, rescan, markpos, and restorepos can also be called.)
597  * Call tuplesort_end to terminate the operation and release memory/disk space.
598  *
599  * Each variant of tuplesort_begin has a workMem parameter specifying the
600  * maximum number of kilobytes of RAM to use before spilling data to disk.
601  * (The normal value of this parameter is work_mem, but some callers use
602  * other values.)  Each variant also has a randomAccess parameter specifying
603  * whether the caller needs non-sequential access to the sort result.
604  */
605
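/*
 * Illustrative sketch only (hypothetical caller): the protocol above looks
 * like this for the heap-tuple case; the elided getter arguments vary by
 * variant and version, see tuplesort.h:
 *
 *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums,
 *									 sortOperators, sortCollations,
 *									 nullsFirstFlags, work_mem, false);
 *		while (... more input tuples ...)
 *			tuplesort_puttupleslot(state, slot);
 *		tuplesort_performsort(state);
 *		while (tuplesort_gettupleslot(state, true, ..., slot, ...))
 *			... process slot ...
 *		tuplesort_end(state);
 */
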
606 static Tuplesortstate *
607 tuplesort_begin_common(int workMem, bool randomAccess)
608 {
609         Tuplesortstate *state;
610         MemoryContext sortcontext;
611         MemoryContext tuplecontext;
612         MemoryContext oldcontext;
613
614         /*
615          * Create a working memory context for this sort operation. All data
616          * needed by the sort will live inside this context.
617          */
618         sortcontext = AllocSetContextCreate(CurrentMemoryContext,
619                                                                                 "TupleSort main",
620                                                                                 ALLOCSET_DEFAULT_SIZES);
621
622         /*
623          * Caller tuple (e.g. IndexTuple) memory context.
624          *
625          * A dedicated child context used exclusively for caller passed tuples
626          * eases memory management.  Resetting at key points reduces
627          * fragmentation. Note that the memtuples array of SortTuples is allocated
628          * in the parent context, not this context, because there is no need to
629          * free memtuples early.
630          */
631         tuplecontext = AllocSetContextCreate(sortcontext,
632                                                                                  "Caller tuples",
633                                                                                  ALLOCSET_DEFAULT_SIZES);
634
635         /*
636          * Make the Tuplesortstate within the per-sort context.  This way, we
637          * don't need a separate pfree() operation for it at shutdown.
638          */
639         oldcontext = MemoryContextSwitchTo(sortcontext);
640
641         state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
642
643 #ifdef TRACE_SORT
644         if (trace_sort)
645                 pg_rusage_init(&state->ru_start);
646 #endif
647
648         state->status = TSS_INITIAL;
649         state->randomAccess = randomAccess;
650         state->bounded = false;
651         state->tuples = true;
652         state->boundUsed = false;
653         state->allowedMem = workMem * (int64) 1024;
654         state->availMem = state->allowedMem;
655         state->sortcontext = sortcontext;
656         state->tuplecontext = tuplecontext;
657         state->tapeset = NULL;
658
659         state->memtupcount = 0;
660
661         /*
662          * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
663          * see comments in grow_memtuples().
664          */
665         state->memtupsize = Max(1024,
666                                                         ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
667
668         state->growmemtuples = true;
669         state->slabAllocatorUsed = false;
670         state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
671
672         USEMEM(state, GetMemoryChunkSpace(state->memtuples));
673
674         /* workMem must be large enough for the minimal memtuples array */
675         if (LACKMEM(state))
676                 elog(ERROR, "insufficient memory allowed for sort");
677
678         state->currentRun = 0;
679
680         /*
681          * maxTapes, tapeRange, and Algorithm D variables will be initialized by
682          * inittapes(), if needed
683          */
684
685         state->result_tape = -1;        /* flag that result tape has not been formed */
686
687         MemoryContextSwitchTo(oldcontext);
688
689         return state;
690 }
691
692 Tuplesortstate *
693 tuplesort_begin_heap(TupleDesc tupDesc,
694                                          int nkeys, AttrNumber *attNums,
695                                          Oid *sortOperators, Oid *sortCollations,
696                                          bool *nullsFirstFlags,
697                                          int workMem, bool randomAccess)
698 {
699         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
700         MemoryContext oldcontext;
701         int                     i;
702
703         oldcontext = MemoryContextSwitchTo(state->sortcontext);
704
705         AssertArg(nkeys > 0);
706
707 #ifdef TRACE_SORT
708         if (trace_sort)
709                 elog(LOG,
710                          "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
711                          nkeys, workMem, randomAccess ? 't' : 'f');
712 #endif
713
714         state->nKeys = nkeys;
715
716         TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
717                                                                 false,  /* no unique check */
718                                                                 nkeys,
719                                                                 workMem,
720                                                                 randomAccess);
721
722         state->comparetup = comparetup_heap;
723         state->copytup = copytup_heap;
724         state->writetup = writetup_heap;
725         state->readtup = readtup_heap;
726
727         state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
728         state->abbrevNext = 10;
729
730         /* Prepare SortSupport data for each column */
731         state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
732
733         for (i = 0; i < nkeys; i++)
734         {
735                 SortSupport sortKey = state->sortKeys + i;
736
737                 AssertArg(attNums[i] != 0);
738                 AssertArg(sortOperators[i] != 0);
739
740                 sortKey->ssup_cxt = CurrentMemoryContext;
741                 sortKey->ssup_collation = sortCollations[i];
742                 sortKey->ssup_nulls_first = nullsFirstFlags[i];
743                 sortKey->ssup_attno = attNums[i];
744                 /* Convey if abbreviation optimization is applicable in principle */
745                 sortKey->abbreviate = (i == 0);
746
747                 PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
748         }
749
750         /*
751          * The "onlyKey" optimization cannot be used with abbreviated keys, since
752          * tie-breaker comparisons may be required.  Typically, the optimization
753          * is only of value to pass-by-value types anyway, whereas abbreviated
754          * keys are typically only of value to pass-by-reference types.
755          */
756         if (nkeys == 1 && !state->sortKeys->abbrev_converter)
757                 state->onlyKey = state->sortKeys;
758
759         MemoryContextSwitchTo(oldcontext);
760
761         return state;
762 }
763
764 Tuplesortstate *
765 tuplesort_begin_cluster(TupleDesc tupDesc,
766                                                 Relation indexRel,
767                                                 int workMem, bool randomAccess)
768 {
769         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
770         ScanKey         indexScanKey;
771         MemoryContext oldcontext;
772         int                     i;
773
774         Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
775
776         oldcontext = MemoryContextSwitchTo(state->sortcontext);
777
778 #ifdef TRACE_SORT
779         if (trace_sort)
780                 elog(LOG,
781                          "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
782                          RelationGetNumberOfAttributes(indexRel),
783                          workMem, randomAccess ? 't' : 'f');
784 #endif
785
786         state->nKeys = RelationGetNumberOfAttributes(indexRel);
787
788         TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
789                                                                 false,  /* no unique check */
790                                                                 state->nKeys,
791                                                                 workMem,
792                                                                 randomAccess);
793
794         state->comparetup = comparetup_cluster;
795         state->copytup = copytup_cluster;
796         state->writetup = writetup_cluster;
797         state->readtup = readtup_cluster;
798         state->abbrevNext = 10;
799
800         state->indexInfo = BuildIndexInfo(indexRel);
801
802         state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
803
804         indexScanKey = _bt_mkscankey_nodata(indexRel);
805
806         if (state->indexInfo->ii_Expressions != NULL)
807         {
808                 TupleTableSlot *slot;
809                 ExprContext *econtext;
810
811                 /*
812                  * We will need to use FormIndexDatum to evaluate the index
813                  * expressions.  To do that, we need an EState, as well as a
814                  * TupleTableSlot to put the table tuples into.  The econtext's
815                  * scantuple has to point to that slot, too.
816                  */
817                 state->estate = CreateExecutorState();
818                 slot = MakeSingleTupleTableSlot(tupDesc);
819                 econtext = GetPerTupleExprContext(state->estate);
820                 econtext->ecxt_scantuple = slot;
821         }
822
823         /* Prepare SortSupport data for each column */
824         state->sortKeys = (SortSupport) palloc0(state->nKeys *
825                                                                                         sizeof(SortSupportData));
826
827         for (i = 0; i < state->nKeys; i++)
828         {
829                 SortSupport sortKey = state->sortKeys + i;
830                 ScanKey         scanKey = indexScanKey + i;
831                 int16           strategy;
832
833                 sortKey->ssup_cxt = CurrentMemoryContext;
834                 sortKey->ssup_collation = scanKey->sk_collation;
835                 sortKey->ssup_nulls_first =
836                         (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
837                 sortKey->ssup_attno = scanKey->sk_attno;
838                 /* Convey if abbreviation optimization is applicable in principle */
839                 sortKey->abbreviate = (i == 0);
840
841                 AssertState(sortKey->ssup_attno != 0);
842
843                 strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
844                         BTGreaterStrategyNumber : BTLessStrategyNumber;
845
846                 PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
847         }
848
849         _bt_freeskey(indexScanKey);
850
851         MemoryContextSwitchTo(oldcontext);
852
853         return state;
854 }
855
856 Tuplesortstate *
857 tuplesort_begin_index_btree(Relation heapRel,
858                                                         Relation indexRel,
859                                                         bool enforceUnique,
860                                                         int workMem, bool randomAccess)
861 {
862         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
863         ScanKey         indexScanKey;
864         MemoryContext oldcontext;
865         int                     i;
866
867         oldcontext = MemoryContextSwitchTo(state->sortcontext);
868
869 #ifdef TRACE_SORT
870         if (trace_sort)
871                 elog(LOG,
872                          "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
873                          enforceUnique ? 't' : 'f',
874                          workMem, randomAccess ? 't' : 'f');
875 #endif
876
877         state->nKeys = RelationGetNumberOfAttributes(indexRel);
878
879         TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
880                                                                 enforceUnique,
881                                                                 state->nKeys,
882                                                                 workMem,
883                                                                 randomAccess);
884
885         state->comparetup = comparetup_index_btree;
886         state->copytup = copytup_index;
887         state->writetup = writetup_index;
888         state->readtup = readtup_index;
889         state->abbrevNext = 10;
890
891         state->heapRel = heapRel;
892         state->indexRel = indexRel;
893         state->enforceUnique = enforceUnique;
894
895         indexScanKey = _bt_mkscankey_nodata(indexRel);
896         state->nKeys = RelationGetNumberOfAttributes(indexRel);
897
898         /* Prepare SortSupport data for each column */
899         state->sortKeys = (SortSupport) palloc0(state->nKeys *
900                                                                                         sizeof(SortSupportData));
901
902         for (i = 0; i < state->nKeys; i++)
903         {
904                 SortSupport sortKey = state->sortKeys + i;
905                 ScanKey         scanKey = indexScanKey + i;
906                 int16           strategy;
907
908                 sortKey->ssup_cxt = CurrentMemoryContext;
909                 sortKey->ssup_collation = scanKey->sk_collation;
910                 sortKey->ssup_nulls_first =
911                         (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
912                 sortKey->ssup_attno = scanKey->sk_attno;
913                 /* Convey if abbreviation optimization is applicable in principle */
914                 sortKey->abbreviate = (i == 0);
915
916                 AssertState(sortKey->ssup_attno != 0);
917
918                 strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
919                         BTGreaterStrategyNumber : BTLessStrategyNumber;
920
921                 PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
922         }
923
924         _bt_freeskey(indexScanKey);
925
926         MemoryContextSwitchTo(oldcontext);
927
928         return state;
929 }
930
931 Tuplesortstate *
932 tuplesort_begin_index_hash(Relation heapRel,
933                                                    Relation indexRel,
934                                                    uint32 high_mask,
935                                                    uint32 low_mask,
936                                                    uint32 max_buckets,
937                                                    int workMem, bool randomAccess)
938 {
939         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
940         MemoryContext oldcontext;
941
942         oldcontext = MemoryContextSwitchTo(state->sortcontext);
943
944 #ifdef TRACE_SORT
945         if (trace_sort)
946                 elog(LOG,
947                          "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
948                          "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
949                          high_mask,
950                          low_mask,
951                          max_buckets,
952                          workMem, randomAccess ? 't' : 'f');
953 #endif
954
955         state->nKeys = 1;                       /* Only one sort column, the hash code */
956
957         state->comparetup = comparetup_index_hash;
958         state->copytup = copytup_index;
959         state->writetup = writetup_index;
960         state->readtup = readtup_index;
961
962         state->heapRel = heapRel;
963         state->indexRel = indexRel;
964
965         state->high_mask = high_mask;
966         state->low_mask = low_mask;
967         state->max_buckets = max_buckets;
968
969         MemoryContextSwitchTo(oldcontext);
970
971         return state;
972 }
973
974 Tuplesortstate *
975 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
976                                           bool nullsFirstFlag,
977                                           int workMem, bool randomAccess)
978 {
979         Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
980         MemoryContext oldcontext;
981         int16           typlen;
982         bool            typbyval;
983
984         oldcontext = MemoryContextSwitchTo(state->sortcontext);
985
986 #ifdef TRACE_SORT
987         if (trace_sort)
988                 elog(LOG,
989                          "begin datum sort: workMem = %d, randomAccess = %c",
990                          workMem, randomAccess ? 't' : 'f');
991 #endif
992
993         state->nKeys = 1;                       /* always a one-column sort */
994
995         TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
996                                                                 false,  /* no unique check */
997                                                                 1,
998                                                                 workMem,
999                                                                 randomAccess);
1000
1001         state->comparetup = comparetup_datum;
1002         state->copytup = copytup_datum;
1003         state->writetup = writetup_datum;
1004         state->readtup = readtup_datum;
1005         state->abbrevNext = 10;
1006
1007         state->datumType = datumType;
1008
1009         /* lookup necessary attributes of the datum type */
1010         get_typlenbyval(datumType, &typlen, &typbyval);
1011         state->datumTypeLen = typlen;
1012         state->tuples = !typbyval;
1013
1014         /* Prepare SortSupport data */
1015         state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1016
1017         state->sortKeys->ssup_cxt = CurrentMemoryContext;
1018         state->sortKeys->ssup_collation = sortCollation;
1019         state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1020
1021         /*
1022          * Abbreviation is possible here only for by-reference types.  In theory,
1023          * a pass-by-value datatype could have an abbreviated form that is cheaper
1024          * to compare.  In a tuple sort, we could support that, because we can
1025  * always extract the original datum from the tuple if needed.  Here, we
1026          * can't, because a datum sort only stores a single copy of the datum; the
1027          * "tuple" field of each sortTuple is NULL.
1028          */
1029         state->sortKeys->abbreviate = !typbyval;
1030
1031         PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1032
1033         /*
1034          * The "onlyKey" optimization cannot be used with abbreviated keys, since
1035          * tie-breaker comparisons may be required.  Typically, the optimization
1036          * is only of value to pass-by-value types anyway, whereas abbreviated
1037          * keys are typically only of value to pass-by-reference types.
1038          */
1039         if (!state->sortKeys->abbrev_converter)
1040                 state->onlyKey = state->sortKeys;
1041
1042         MemoryContextSwitchTo(oldcontext);
1043
1044         return state;
1045 }
1046
1047 /*
1048  * tuplesort_set_bound
1049  *
1050  *      Advise tuplesort that at most the first N result tuples are required.
1051  *
1052  * Must be called before inserting any tuples.  (Actually, we could allow it
1053  * as long as the sort hasn't spilled to disk, but there seems no need for
1054  * delayed calls at the moment.)
1055  *
1056  * This is a hint only. The tuplesort may still return more tuples than
1057  * requested.
1058  */
1059 void
1060 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1061 {
1062         /* Assert we're called before loading any tuples */
1063         Assert(state->status == TSS_INITIAL);
1064         Assert(state->memtupcount == 0);
1065         Assert(!state->bounded);
1066
1067 #ifdef DEBUG_BOUNDED_SORT
1068         /* Honor GUC setting that disables the feature (for easy testing) */
1069         if (!optimize_bounded_sort)
1070                 return;
1071 #endif
1072
1073         /* We want to be able to compute bound * 2, so limit the setting */
1074         if (bound > (int64) (INT_MAX / 2))
1075                 return;
1076
1077         state->bounded = true;
1078         state->bound = (int) bound;
1079
1080         /*
1081          * Bounded sorts are not an effective target for abbreviated key
1082          * optimization.  Disable by setting state to be consistent with no
1083          * abbreviation support.
1084          */
1085         state->sortKeys->abbrev_converter = NULL;
1086         if (state->sortKeys->abbrev_full_comparator)
1087                 state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1088
1089         /* Not strictly necessary, but be tidy */
1090         state->sortKeys->abbrev_abort = NULL;
1091         state->sortKeys->abbrev_full_comparator = NULL;
1092 }
1093
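/*
 * Sketch only (hypothetical caller): for a LIMIT-style query the caller
 * would typically do, before inserting any tuples,
 *
 *		state = tuplesort_begin_heap(...);
 *		if (the limit is known and modest)
 *			tuplesort_set_bound(state, limit);
 *
 * allowing the bounded-heap path to discard tuples that cannot appear among
 * the first 'limit' outputs.
 */
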
1094 /*
1095  * tuplesort_end
1096  *
1097  *      Release resources and clean up.
1098  *
1099  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1100  * pointing to garbage.  Be careful not to attempt to use or free such
1101  * pointers afterwards!
1102  */
1103 void
1104 tuplesort_end(Tuplesortstate *state)
1105 {
1106         /* context swap probably not needed, but let's be safe */
1107         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1108
1109 #ifdef TRACE_SORT
1110         long            spaceUsed;
1111
1112         if (state->tapeset)
1113                 spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1114         else
1115                 spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1116 #endif
1117
1118         /*
1119          * Delete temporary "tape" files, if any.
1120          *
1121          * Note: want to include this in reported total cost of sort, hence need
1122          * for two #ifdef TRACE_SORT sections.
1123          */
1124         if (state->tapeset)
1125                 LogicalTapeSetClose(state->tapeset);
1126
1127 #ifdef TRACE_SORT
1128         if (trace_sort)
1129         {
1130                 if (state->tapeset)
1131                         elog(LOG, "external sort ended, %ld disk blocks used: %s",
1132                                  spaceUsed, pg_rusage_show(&state->ru_start));
1133                 else
1134                         elog(LOG, "internal sort ended, %ld KB used: %s",
1135                                  spaceUsed, pg_rusage_show(&state->ru_start));
1136         }
1137
1138         TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1139 #else
1140
1141         /*
1142          * If you disabled TRACE_SORT, you can still probe sort__done, but you
1143          * ain't getting space-used stats.
1144          */
1145         TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1146 #endif
1147
1148         /* Free any execution state created for CLUSTER case */
1149         if (state->estate != NULL)
1150         {
1151                 ExprContext *econtext = GetPerTupleExprContext(state->estate);
1152
1153                 ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1154                 FreeExecutorState(state->estate);
1155         }
1156
1157         MemoryContextSwitchTo(oldcontext);
1158
1159         /*
1160          * Free the per-sort memory context, thereby releasing all working memory,
1161          * including the Tuplesortstate struct itself.
1162          */
1163         MemoryContextDelete(state->sortcontext);
1164 }
1165
1166 /*
1167  * Grow the memtuples[] array, if possible within our memory constraint.  We
1168  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1169  * limit.  Return TRUE if we were able to enlarge the array, FALSE if not.
1170  *
1171  * Normally, at each increment we double the size of the array.  When doing
1172  * that would exceed a limit, we attempt one last, smaller increase (and then
1173  * clear the growmemtuples flag so we don't try any more).  That allows us to
1174  * use memory as fully as permitted; sticking to the pure doubling rule could
1175  * result in almost half going unused.  Because availMem moves around with
1176  * tuple addition/removal, we need some rule to prevent making repeated small
1177  * increases in memtupsize, which would just be useless thrashing.  The
1178  * growmemtuples flag accomplishes that and also prevents useless
1179  * recalculations in this function.
1180  */
1181 static bool
1182 grow_memtuples(Tuplesortstate *state)
1183 {
1184         int                     newmemtupsize;
1185         int                     memtupsize = state->memtupsize;
1186         int64           memNowUsed = state->allowedMem - state->availMem;
1187
1188         /* Forget it if we've already maxed out memtuples, per comment above */
1189         if (!state->growmemtuples)
1190                 return false;
1191
1192         /* Select new value of memtupsize */
1193         if (memNowUsed <= state->availMem)
1194         {
1195                 /*
1196                  * We've used no more than half of allowedMem; double our usage,
1197                  * clamping at INT_MAX tuples.
1198                  */
1199                 if (memtupsize < INT_MAX / 2)
1200                         newmemtupsize = memtupsize * 2;
1201                 else
1202                 {
1203                         newmemtupsize = INT_MAX;
1204                         state->growmemtuples = false;
1205                 }
1206         }
1207         else
1208         {
1209                 /*
1210                  * This will be the last increment of memtupsize.  Abandon doubling
1211                  * strategy and instead increase as much as we safely can.
1212                  *
1213                  * To stay within allowedMem, we can't increase memtupsize by more
1214                  * than availMem / sizeof(SortTuple) elements.  In practice, we want
1215                  * to increase it by considerably less, because we need to leave some
1216                  * space for the tuples to which the new array slots will refer.  We
1217                  * assume the new tuples will be about the same size as the tuples
1218                  * we've already seen, and thus we can extrapolate from the space
1219                  * consumption so far to estimate an appropriate new size for the
1220                  * memtuples array.  The optimal value might be higher or lower than
1221                  * this estimate, but it's hard to know that in advance.  We again
1222                  * clamp at INT_MAX tuples.
1223                  *
1224                  * This calculation is safe against enlarging the array so much that
1225                  * LACKMEM becomes true, because the memory currently used includes
1226                  * the present array; thus, there would be enough allowedMem for the
1227                  * new array elements even if no other memory were currently used.
1228                  *
1229                  * We do the arithmetic in float8, because otherwise the product of
1230                  * memtupsize and allowedMem could overflow.  Any inaccuracy in the
1231                  * result should be insignificant; but even if we computed a
1232                  * completely insane result, the checks below will prevent anything
1233                  * really bad from happening.
1234                  */
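		/*
		 * Worked example with illustrative numbers (not taken from any real
		 * run): if allowedMem is 64MB and memNowUsed is 48MB, then
		 * grow_ratio = 64/48 ~= 1.33, so a memtuples array of 100,000 slots
		 * would be enlarged to roughly 133,000 slots, on the assumption that
		 * the remaining 16MB fills with tuples of about the same average
		 * size as those seen so far.
		 */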
1235                 double          grow_ratio;
1236
1237                 grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1238                 if (memtupsize * grow_ratio < INT_MAX)
1239                         newmemtupsize = (int) (memtupsize * grow_ratio);
1240                 else
1241                         newmemtupsize = INT_MAX;
1242
1243                 /* We won't make any further enlargement attempts */
1244                 state->growmemtuples = false;
1245         }
1246
1247         /* Must enlarge array by at least one element, else report failure */
1248         if (newmemtupsize <= memtupsize)
1249                 goto noalloc;
1250
1251         /*
1252          * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize.  Clamp
1253          * to ensure our request won't be rejected.  Note that we can easily
1254          * exhaust address space before facing this outcome.  (This is presently
1255          * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1256          * don't rely on that at this distance.)
1257          */
1258         if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1259         {
1260                 newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1261                 state->growmemtuples = false;   /* can't grow any more */
1262         }
1263
1264         /*
1265          * We need to be sure that we do not cause LACKMEM to become true, else
1266          * the space management algorithm will go nuts.  The code above should
1267          * never generate a dangerous request, but to be safe, check explicitly
1268          * that the array growth fits within availMem.  (We could still cause
1269          * LACKMEM if the memory chunk overhead associated with the memtuples
1270          * array were to increase.  That shouldn't happen because we chose the
1271          * initial array size large enough to ensure that palloc will be treating
1272          * both old and new arrays as separate chunks.  But we'll check LACKMEM
1273          * explicitly below just in case.)
1274          */
1275         if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1276                 goto noalloc;
1277
1278         /* OK, do it */
1279         FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1280         state->memtupsize = newmemtupsize;
1281         state->memtuples = (SortTuple *)
1282                 repalloc_huge(state->memtuples,
1283                                           state->memtupsize * sizeof(SortTuple));
1284         USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1285         if (LACKMEM(state))
1286                 elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1287         return true;
1288
1289 noalloc:
1290         /* If for any reason we didn't realloc, shut off future attempts */
1291         state->growmemtuples = false;
1292         return false;
1293 }
1294
1295 /*
1296  * Accept one tuple while collecting input data for sort.
1297  *
1298  * Note that the input data is always copied; the caller need not save it.
1299  */
1300 void
1301 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1302 {
1303         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1304         SortTuple       stup;
1305
1306         /*
1307          * Copy the given tuple into memory we control, and decrease availMem.
1308          * Then call the common code.
1309          */
1310         COPYTUP(state, &stup, (void *) slot);
1311
1312         puttuple_common(state, &stup);
1313
1314         MemoryContextSwitchTo(oldcontext);
1315 }
1316
1317 /*
1318  * Accept one tuple while collecting input data for sort.
1319  *
1320  * Note that the input data is always copied; the caller need not save it.
1321  */
1322 void
1323 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1324 {
1325         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1326         SortTuple       stup;
1327
1328         /*
1329          * Copy the given tuple into memory we control, and decrease availMem.
1330          * Then call the common code.
1331          */
1332         COPYTUP(state, &stup, (void *) tup);
1333
1334         puttuple_common(state, &stup);
1335
1336         MemoryContextSwitchTo(oldcontext);
1337 }
1338
1339 /*
1340  * Collect one index tuple while collecting input data for sort, building
1341  * it from caller-supplied values.
1342  */
1343 void
1344 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1345                                                           ItemPointer self, Datum *values,
1346                                                           bool *isnull)
1347 {
1348         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1349         SortTuple       stup;
1350         Datum           original;
1351         IndexTuple      tuple;
1352
1353         stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1354         tuple = ((IndexTuple) stup.tuple);
1355         tuple->t_tid = *self;
1356         USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1357         /* set up first-column key value */
1358         original = index_getattr(tuple,
1359                                                          1,
1360                                                          RelationGetDescr(state->indexRel),
1361                                                          &stup.isnull1);
1362
1363         MemoryContextSwitchTo(state->sortcontext);
1364
1365         if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1366         {
1367                 /*
1368                  * Store the ordinary Datum representation, or the NULL value.  If
1369                  * there is a converter, it won't expect NULL values, and the cost
1370                  * model is not required to account for NULLs, so in that case we
1371                  * avoid calling the converter and just set datum1 to the zeroed
1372                  * representation (to be consistent, and to support cheap inequality
1373                  * tests for NULL abbreviated keys).
1374                  */
1375                 stup.datum1 = original;
1376         }
1377         else if (!consider_abort_common(state))
1378         {
1379                 /* Store abbreviated key representation */
1380                 stup.datum1 = state->sortKeys->abbrev_converter(original,
1381                                                                                                                 state->sortKeys);
1382         }
1383         else
1384         {
1385                 /* Abort abbreviation */
1386                 int                     i;
1387
1388                 stup.datum1 = original;
1389
1390                 /*
1391                  * Set state to be consistent with never trying abbreviation.
1392                  *
1393                  * Alter datum1 representation in already-copied tuples, so as to
1394                  * ensure a consistent representation (current tuple was just
1395                  * handled).  It does not matter if some dumped tuples are already
1396                  * sorted on tape, since serialized tuples lack abbreviated keys
1397                  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1398                  */
1399                 for (i = 0; i < state->memtupcount; i++)
1400                 {
1401                         SortTuple  *mtup = &state->memtuples[i];
1402
1403                         tuple = mtup->tuple;
1404                         mtup->datum1 = index_getattr(tuple,
1405                                                                                  1,
1406                                                                                  RelationGetDescr(state->indexRel),
1407                                                                                  &mtup->isnull1);
1408                 }
1409         }
1410
1411         puttuple_common(state, &stup);
1412
1413         MemoryContextSwitchTo(oldcontext);
1414 }
1415
1416 /*
1417  * Accept one Datum while collecting input data for sort.
1418  *
1419  * If the Datum is pass-by-ref type, the value will be copied.
1420  */
1421 void
1422 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1423 {
1424         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1425         SortTuple       stup;
1426
1427         /*
1428          * Pass-by-value types or null values are just stored directly in
1429          * stup.datum1 (and stup.tuple is not used and set to NULL).
1430          *
1431          * Non-null pass-by-reference values need to be copied into memory we
1432          * control, and possibly abbreviated. The copied value is pointed to by
1433          * stup.tuple and is treated as the canonical copy (e.g. to return via
1434          * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1435          * abbreviated value if abbreviation is happening, otherwise it's
1436          * identical to stup.tuple.
1437          */
1438
1439         if (isNull || !state->tuples)
1440         {
1441                 /*
1442                  * Set datum1 to zeroed representation for NULLs (to be consistent,
1443                  * and to support cheap inequality tests for NULL abbreviated keys).
1444                  */
1445                 stup.datum1 = !isNull ? val : (Datum) 0;
1446                 stup.isnull1 = isNull;
1447                 stup.tuple = NULL;              /* no separate storage */
1448                 MemoryContextSwitchTo(state->sortcontext);
1449         }
1450         else
1451         {
1452                 Datum           original = datumCopy(val, false, state->datumTypeLen);
1453
1454                 stup.isnull1 = false;
1455                 stup.tuple = DatumGetPointer(original);
1456                 USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1457                 MemoryContextSwitchTo(state->sortcontext);
1458
1459                 if (!state->sortKeys->abbrev_converter)
1460                 {
1461                         stup.datum1 = original;
1462                 }
1463                 else if (!consider_abort_common(state))
1464                 {
1465                         /* Store abbreviated key representation */
1466                         stup.datum1 = state->sortKeys->abbrev_converter(original,
1467                                                                                                                         state->sortKeys);
1468                 }
1469                 else
1470                 {
1471                         /* Abort abbreviation */
1472                         int                     i;
1473
1474                         stup.datum1 = original;
1475
1476                         /*
1477                          * Set state to be consistent with never trying abbreviation.
1478                          *
1479                          * Alter datum1 representation in already-copied tuples, so as to
1480                          * ensure a consistent representation (current tuple was just
1481                          * handled).  It does not matter if some dumped tuples are already
1482                          * sorted on tape, since serialized tuples lack abbreviated keys
1483                          * (TSS_BUILDRUNS state prevents control reaching here in any
1484                          * case).
1485                          */
1486                         for (i = 0; i < state->memtupcount; i++)
1487                         {
1488                                 SortTuple  *mtup = &state->memtuples[i];
1489
1490                                 mtup->datum1 = PointerGetDatum(mtup->tuple);
1491                         }
1492                 }
1493         }
1494
1495         puttuple_common(state, &stup);
1496
1497         MemoryContextSwitchTo(oldcontext);
1498 }
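/*
 * Illustrative usage sketch: how a caller might drive the Datum-sort entry
 * points above.  The Tuplesortstate is assumed to have been created with
 * tuplesort_begin_datum() (defined earlier in this file); its argument list
 * is not repeated here.  Kept under #if 0 so it is never compiled.
 */
#if 0
static void
datum_sort_usage_sketch(Tuplesortstate *ts)
{
	Datum		val;
	bool		isnull;

	/* Feed the input values; pass-by-reference Datums are copied. */
	tuplesort_putdatum(ts, Int32GetDatum(42), false);
	tuplesort_putdatum(ts, Int32GetDatum(7), false);
	tuplesort_putdatum(ts, (Datum) 0, true);	/* a NULL input */

	/* Finish the sort, then scan the values back in sorted order. */
	tuplesort_performsort(ts);
	while (tuplesort_getdatum(ts, true, &val, &isnull, NULL))
	{
		/* consume val / isnull here */
	}

	tuplesort_end(ts);
}
#endif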
1499
1500 /*
1501  * Shared code for tuple and datum cases.
1502  */
1503 static void
1504 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1505 {
1506         switch (state->status)
1507         {
1508                 case TSS_INITIAL:
1509
1510                         /*
1511                          * Save the tuple into the unsorted array.  First, grow the array
1512                          * as needed.  Note that we try to grow the array when there is
1513                          * still one free slot remaining --- if we fail, there'll still be
1514                          * room to store the incoming tuple, and then we'll switch to
1515                          * tape-based operation.
1516                          */
1517                         if (state->memtupcount >= state->memtupsize - 1)
1518                         {
1519                                 (void) grow_memtuples(state);
1520                                 Assert(state->memtupcount < state->memtupsize);
1521                         }
1522                         state->memtuples[state->memtupcount++] = *tuple;
1523
1524                         /*
1525                          * Check if it's time to switch over to a bounded heapsort. We do
1526                          * so if the input tuple count exceeds twice the desired tuple
1527                          * count (this is a heuristic for where heapsort becomes cheaper
1528                          * than a quicksort), or if we've just filled workMem and have
1529                          * enough tuples to meet the bound.
1530                          *
1531                          * Note that once we enter TSS_BOUNDED state we will always try to
1532                          * complete the sort that way.  In the worst case, if later input
1533                          * tuples are larger than earlier ones, this might cause us to
1534                          * exceed workMem significantly.
1535                          */
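			/*
			 * Worked example (illustrative): if the caller has set a bound of
			 * 100 tuples, we switch to the bounded heap either when the 201st
			 * input tuple arrives, or as soon as more than 100 tuples are in
			 * memory and workMem has been exhausted.
			 */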
1536                         if (state->bounded &&
1537                                 (state->memtupcount > state->bound * 2 ||
1538                                  (state->memtupcount > state->bound && LACKMEM(state))))
1539                         {
1540 #ifdef TRACE_SORT
1541                                 if (trace_sort)
1542                                         elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1543                                                  state->memtupcount,
1544                                                  pg_rusage_show(&state->ru_start));
1545 #endif
1546                                 make_bounded_heap(state);
1547                                 return;
1548                         }
1549
1550                         /*
1551                          * Done if we still fit in available memory and have array slots.
1552                          */
1553                         if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1554                                 return;
1555
1556                         /*
1557                          * Nope; time to switch to tape-based operation.
1558                          */
1559                         inittapes(state);
1560
1561                         /*
1562                          * Dump all tuples.
1563                          */
1564                         dumptuples(state, false);
1565                         break;
1566
1567                 case TSS_BOUNDED:
1568
1569                         /*
1570                          * We don't want to grow the array here, so check whether the new
1571                          * tuple can be discarded before putting it in.  This should be a
1572                          * good speed optimization, too, since when there are many more
1573                          * input tuples than the bound, most input tuples can be discarded
1574                          * with just this one comparison.  Note that because we currently
1575                          * have the sort direction reversed, we must check for <= not >=.
1576                          */
1577                         if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1578                         {
1579                                 /* new tuple <= top of the heap, so we can discard it */
1580                                 free_sort_tuple(state, tuple);
1581                                 CHECK_FOR_INTERRUPTS();
1582                         }
1583                         else
1584                         {
1585                                 /* discard top of heap, replacing it with the new tuple */
1586                                 free_sort_tuple(state, &state->memtuples[0]);
1587                                 tuplesort_heap_replace_top(state, tuple);
1588                         }
1589                         break;
1590
1591                 case TSS_BUILDRUNS:
1592
1593                         /*
1594                          * Save the tuple into the unsorted array (there must be
1595                          * space)
1596                          */
1597                         state->memtuples[state->memtupcount++] = *tuple;
1598
1599                         /*
1600                          * If we are over the memory limit, dump all tuples.
1601                          */
1602                         dumptuples(state, false);
1603                         break;
1604
1605                 default:
1606                         elog(ERROR, "invalid tuplesort state");
1607                         break;
1608         }
1609 }
1610
1611 static bool
1612 consider_abort_common(Tuplesortstate *state)
1613 {
1614         Assert(state->sortKeys[0].abbrev_converter != NULL);
1615         Assert(state->sortKeys[0].abbrev_abort != NULL);
1616         Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
1617
1618         /*
1619          * Check effectiveness of abbreviation optimization.  Consider aborting
1620          * when still within memory limit.
1621          */
1622         if (state->status == TSS_INITIAL &&
1623                 state->memtupcount >= state->abbrevNext)
1624         {
1625                 state->abbrevNext *= 2;
1626
1627                 /*
1628                  * Check opclass-supplied abbreviation abort routine.  It may indicate
1629                  * that abbreviation should not proceed.
1630                  */
1631                 if (!state->sortKeys->abbrev_abort(state->memtupcount,
1632                                                                                    state->sortKeys))
1633                         return false;
1634
1635                 /*
1636                  * Finally, restore authoritative comparator, and indicate that
1637                  * abbreviation is not in play by setting abbrev_converter to NULL
1638                  */
1639                 state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
1640                 state->sortKeys[0].abbrev_converter = NULL;
1641                 /* Not strictly necessary, but be tidy */
1642                 state->sortKeys[0].abbrev_abort = NULL;
1643                 state->sortKeys[0].abbrev_full_comparator = NULL;
1644
1645                 /* Give up - expect original pass-by-value representation */
1646                 return true;
1647         }
1648
1649         return false;
1650 }
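/*
 * Illustrative note on the doubling cadence above: because abbrevNext doubles
 * each time the check fires, the opclass abbrev_abort hook is consulted only
 * O(log N) times over N input tuples.  Assuming, purely for illustration, an
 * initial threshold of 10, the checks would occur after 10, 20, 40, 80, ...
 * tuples, so the cost of monitoring abbreviation effectiveness stays
 * negligible.
 */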
1651
1652 /*
1653  * All tuples have been provided; finish the sort.
1654  */
1655 void
1656 tuplesort_performsort(Tuplesortstate *state)
1657 {
1658         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1659
1660 #ifdef TRACE_SORT
1661         if (trace_sort)
1662                 elog(LOG, "performsort starting: %s",
1663                          pg_rusage_show(&state->ru_start));
1664 #endif
1665
1666         switch (state->status)
1667         {
1668                 case TSS_INITIAL:
1669
1670                         /*
1671                          * We were able to accumulate all the tuples within the allowed
1672                          * amount of memory.  Just qsort 'em and we're done.
1673                          */
1674                         tuplesort_sort_memtuples(state);
1675                         state->current = 0;
1676                         state->eof_reached = false;
1677                         state->markpos_offset = 0;
1678                         state->markpos_eof = false;
1679                         state->status = TSS_SORTEDINMEM;
1680                         break;
1681
1682                 case TSS_BOUNDED:
1683
1684                         /*
1685                          * We were able to accumulate all the tuples required for output
1686                          * in memory, using a heap to eliminate excess tuples.  Now we
1687                          * have to transform the heap to a properly-sorted array.
1688                          */
1689                         sort_bounded_heap(state);
1690                         state->current = 0;
1691                         state->eof_reached = false;
1692                         state->markpos_offset = 0;
1693                         state->markpos_eof = false;
1694                         state->status = TSS_SORTEDINMEM;
1695                         break;
1696
1697                 case TSS_BUILDRUNS:
1698
1699                         /*
1700                          * Finish tape-based sort.  First, flush all tuples remaining in
1701                          * memory out to tape; then merge until we have a single remaining
1702                          * run (or, if !randomAccess, one run per tape). Note that
1703                          * mergeruns sets the correct state->status.
1704                          */
1705                         dumptuples(state, true);
1706                         mergeruns(state);
1707                         state->eof_reached = false;
1708                         state->markpos_block = 0L;
1709                         state->markpos_offset = 0;
1710                         state->markpos_eof = false;
1711                         break;
1712
1713                 default:
1714                         elog(ERROR, "invalid tuplesort state");
1715                         break;
1716         }
1717
1718 #ifdef TRACE_SORT
1719         if (trace_sort)
1720         {
1721                 if (state->status == TSS_FINALMERGE)
1722                         elog(LOG, "performsort done (except %d-way final merge): %s",
1723                                  state->activeTapes,
1724                                  pg_rusage_show(&state->ru_start));
1725                 else
1726                         elog(LOG, "performsort done: %s",
1727                                  pg_rusage_show(&state->ru_start));
1728         }
1729 #endif
1730
1731         MemoryContextSwitchTo(oldcontext);
1732 }
1733
1734 /*
1735  * Internal routine to fetch the next tuple in either forward or back
1736  * direction into *stup.  Returns FALSE if no more tuples.
1737  * Returned tuple belongs to tuplesort memory context, and must not be freed
1738  * by caller.  Note that the fetched tuple is stored in memory that may be
1739  * recycled by any future fetch.
1740  */
1741 static bool
1742 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
1743                                                   SortTuple *stup)
1744 {
1745         unsigned int tuplen;
1746         size_t          nmoved;
1747
1748         switch (state->status)
1749         {
1750                 case TSS_SORTEDINMEM:
1751                         Assert(forward || state->randomAccess);
1752                         Assert(!state->slabAllocatorUsed);
1753                         if (forward)
1754                         {
1755                                 if (state->current < state->memtupcount)
1756                                 {
1757                                         *stup = state->memtuples[state->current++];
1758                                         return true;
1759                                 }
1760                                 state->eof_reached = true;
1761
1762                                 /*
1763                                  * Complain if caller tries to retrieve more tuples than
1764                                  * originally asked for in a bounded sort.  This is because
1765                                  * returning EOF here might be the wrong thing.
1766                                  */
1767                                 if (state->bounded && state->current >= state->bound)
1768                                         elog(ERROR, "retrieved too many tuples in a bounded sort");
1769
1770                                 return false;
1771                         }
1772                         else
1773                         {
1774                                 if (state->current <= 0)
1775                                         return false;
1776
1777                                 /*
1778                                  * If all tuples have been fetched already, return the last
1779                                  * tuple; otherwise, the tuple before the last one returned.
1780                                  */
1781                                 if (state->eof_reached)
1782                                         state->eof_reached = false;
1783                                 else
1784                                 {
1785                                         state->current--;       /* last returned tuple */
1786                                         if (state->current <= 0)
1787                                                 return false;
1788                                 }
1789                                 *stup = state->memtuples[state->current - 1];
1790                                 return true;
1791                         }
1792                         break;
1793
1794                 case TSS_SORTEDONTAPE:
1795                         Assert(forward || state->randomAccess);
1796                         Assert(state->slabAllocatorUsed);
1797
1798                         /*
1799                          * The slot that held the tuple returned by the previous gettuple
1800                          * call can now be reused.
1801                          */
1802                         if (state->lastReturnedTuple)
1803                         {
1804                                 RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1805                                 state->lastReturnedTuple = NULL;
1806                         }
1807
1808                         if (forward)
1809                         {
1810                                 if (state->eof_reached)
1811                                         return false;
1812
1813                                 if ((tuplen = getlen(state, state->result_tape, true)) != 0)
1814                                 {
1815                                         READTUP(state, stup, state->result_tape, tuplen);
1816
1817                                         /*
1818                                          * Remember the tuple we return, so that we can recycle
1819                                          * its memory on next call.  (This can be NULL, in the
1820                                          * !state->tuples case).
1821                                          */
1822                                         state->lastReturnedTuple = stup->tuple;
1823
1824                                         return true;
1825                                 }
1826                                 else
1827                                 {
1828                                         state->eof_reached = true;
1829                                         return false;
1830                                 }
1831                         }
1832
1833                         /*
1834                          * Backward.
1835                          *
1836                          * If all tuples have been fetched already, return the last tuple;
1837                          * otherwise, the tuple before the last one returned.
1838                          */
1839                         if (state->eof_reached)
1840                         {
1841                                 /*
1842                                  * Seek position is pointing just past the zero tuplen at the
1843                                  * end of file; back up to fetch last tuple's ending length
1844                                  * word.  If seek fails we must have a completely empty file.
1845                                  */
1846                                 nmoved = LogicalTapeBackspace(state->tapeset,
1847                                                                                           state->result_tape,
1848                                                                                           2 * sizeof(unsigned int));
1849                                 if (nmoved == 0)
1850                                         return false;
1851                                 else if (nmoved != 2 * sizeof(unsigned int))
1852                                         elog(ERROR, "unexpected tape position");
1853                                 state->eof_reached = false;
1854                         }
1855                         else
1856                         {
1857                                 /*
1858                                  * Back up and fetch previously-returned tuple's ending length
1859                                  * word.  If seek fails, assume we are at start of file.
1860                                  */
1861                                 nmoved = LogicalTapeBackspace(state->tapeset,
1862                                                                                           state->result_tape,
1863                                                                                           sizeof(unsigned int));
1864                                 if (nmoved == 0)
1865                                         return false;
1866                                 else if (nmoved != sizeof(unsigned int))
1867                                         elog(ERROR, "unexpected tape position");
1868                                 tuplen = getlen(state, state->result_tape, false);
1869
1870                                 /*
1871                                  * Back up to get ending length word of tuple before it.
1872                                  */
1873                                 nmoved = LogicalTapeBackspace(state->tapeset,
1874                                                                                           state->result_tape,
1875                                                                                           tuplen + 2 * sizeof(unsigned int));
1876                                 if (nmoved == tuplen + sizeof(unsigned int))
1877                                 {
1878                                         /*
1879                                          * We backed up over the previous tuple, but there was no
1880                                          * ending length word before it.  That means the previous
1881                                          * tuple is the first tuple in the file.  It is now the
1882                                          * next to read in the forward direction (not obviously
1883                                          * right, but that is what the in-memory case does).
1884                                          */
1885                                         return false;
1886                                 }
1887                                 else if (nmoved != tuplen + 2 * sizeof(unsigned int))
1888                                         elog(ERROR, "bogus tuple length in backward scan");
1889                         }
1890
1891                         tuplen = getlen(state, state->result_tape, false);
1892
1893                         /*
1894                          * Now we have the length of the prior tuple, back up and read it.
1895                          * Note: READTUP expects we are positioned after the initial
1896                          * length word of the tuple, so back up to that point.
1897                          */
1898                         nmoved = LogicalTapeBackspace(state->tapeset,
1899                                                                                   state->result_tape,
1900                                                                                   tuplen);
1901                         if (nmoved != tuplen)
1902                                 elog(ERROR, "bogus tuple length in backward scan");
1903                         READTUP(state, stup, state->result_tape, tuplen);
1904
1905                         /*
1906                          * Remember the tuple we return, so that we can recycle its memory
1907                          * on next call. (This can be NULL, in the Datum case).
1908                          */
1909                         state->lastReturnedTuple = stup->tuple;
1910
1911                         return true;
1912
1913                 case TSS_FINALMERGE:
1914                         Assert(forward);
1915                         /* We are managing memory ourselves, with the slab allocator. */
1916                         Assert(state->slabAllocatorUsed);
1917
1918                         /*
1919                          * The slab slot holding the tuple returned by the previous
1920                          * gettuple call can now be reused.
1921                          */
1922                         if (state->lastReturnedTuple)
1923                         {
1924                                 RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1925                                 state->lastReturnedTuple = NULL;
1926                         }
1927
1928                         /*
1929                          * This code should match the inner loop of mergeonerun().
1930                          */
1931                         if (state->memtupcount > 0)
1932                         {
1933                                 int                     srcTape = state->memtuples[0].tupindex;
1934                                 SortTuple       newtup;
1935
1936                                 *stup = state->memtuples[0];
1937
1938                                 /*
1939                                  * Remember the tuple we return, so that we can recycle its
1940                                  * memory on next call. (This can be NULL, in the Datum case).
1941                                  */
1942                                 state->lastReturnedTuple = stup->tuple;
1943
1944                                 /*
1945                                  * Pull next tuple from tape, and replace the returned tuple
1946                                  * at top of the heap with it.
1947                                  */
1948                                 if (!mergereadnext(state, srcTape, &newtup))
1949                                 {
1950                                         /*
1951                                          * If no more data, we've reached end of run on this tape.
1952                                          * Remove the top node from the heap.
1953                                          */
1954                                         tuplesort_heap_delete_top(state);
1955
1956                                         /*
1957                                          * Rewind to free the read buffer.  It'd go away at the
1958                                          * end of the sort anyway, but better to release the
1959                                          * memory early.
1960                                          */
1961                                         LogicalTapeRewindForWrite(state->tapeset, srcTape);
1962                                         return true;
1963                                 }
1964                                 newtup.tupindex = srcTape;
1965                                 tuplesort_heap_replace_top(state, &newtup);
1966                                 return true;
1967                         }
1968                         return false;
1969
1970                 default:
1971                         elog(ERROR, "invalid tuplesort state");
1972                         return false;           /* keep compiler quiet */
1973         }
1974 }
1975
1976 /*
1977  * Fetch the next tuple in either forward or back direction.
1978  * If successful, put tuple in slot and return TRUE; else, clear the slot
1979  * and return FALSE.
1980  *
1981  * Caller may optionally be passed back abbreviated value (on TRUE return
1982  * value) when abbreviation was used, which can be used to cheaply avoid
1983  * equality checks that might otherwise be required.  Caller can safely make a
1984  * determination of "non-equal tuple" based on simple binary inequality.  A
1985  * NULL value in leading attribute will set abbreviated value to zeroed
1986  * representation, which caller may rely on in abbreviated inequality check.
1987  *
1988  * If copy is true, the slot receives a copied tuple that will stay valid
1989  * regardless of future manipulations of the tuplesort's state.  Memory is
1990  * owned by the caller.  If copy is false, the slot will just receive a
1991  * pointer to a tuple held within the tuplesort, which is more efficient, but
1992  * only safe for callers that are prepared to have any subsequent manipulation
1993  * of the tuplesort's state invalidate slot contents.
1994  */
1995 bool
1996 tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
1997                                            TupleTableSlot *slot, Datum *abbrev)
1998 {
1999         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2000         SortTuple       stup;
2001
2002         if (!tuplesort_gettuple_common(state, forward, &stup))
2003                 stup.tuple = NULL;
2004
2005         MemoryContextSwitchTo(oldcontext);
2006
2007         if (stup.tuple)
2008         {
2009                 /* Record abbreviated key for caller */
2010                 if (state->sortKeys->abbrev_converter && abbrev)
2011                         *abbrev = stup.datum1;
2012
2013                 if (copy)
2014                         stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
2015
2016                 ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
2017                 return true;
2018         }
2019         else
2020         {
2021                 ExecClearTuple(slot);
2022                 return false;
2023         }
2024 }
2025
2026 /*
2027  * Fetch the next tuple in either forward or back direction.
2028  * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
2029  * context, and must not be freed by caller.  Caller may not rely on tuple
2030  * remaining valid after any further manipulation of tuplesort.
2031  */
2032 HeapTuple
2033 tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
2034 {
2035         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2036         SortTuple       stup;
2037
2038         if (!tuplesort_gettuple_common(state, forward, &stup))
2039                 stup.tuple = NULL;
2040
2041         MemoryContextSwitchTo(oldcontext);
2042
2043         return stup.tuple;
2044 }
2045
2046 /*
2047  * Fetch the next index tuple in either forward or back direction.
2048  * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
2049  * context, and must not be freed by caller.  Caller may not rely on tuple
2050  * remaining valid after any further manipulation of tuplesort.
2051  */
2052 IndexTuple
2053 tuplesort_getindextuple(Tuplesortstate *state, bool forward)
2054 {
2055         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2056         SortTuple       stup;
2057
2058         if (!tuplesort_gettuple_common(state, forward, &stup))
2059                 stup.tuple = NULL;
2060
2061         MemoryContextSwitchTo(oldcontext);
2062
2063         return (IndexTuple) stup.tuple;
2064 }
2065
2066 /*
2067  * Fetch the next Datum in either forward or back direction.
2068  * Returns FALSE if no more datums.
2069  *
2070  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2071  * and is now owned by the caller (this differs from similar routines for
2072  * other types of tuplesorts).
2073  *
2074  * Caller may optionally be passed back abbreviated value (on TRUE return
2075  * value) when abbreviation was used, which can be used to cheaply avoid
2076  * equality checks that might otherwise be required.  Caller can safely make a
2077  * determination of "non-equal tuple" based on simple binary inequality.  A
2078  * NULL value will have a zeroed abbreviated value representation, which caller
2079  * may rely on in abbreviated inequality check.
2080  */
2081 bool
2082 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2083                                    Datum *val, bool *isNull, Datum *abbrev)
2084 {
2085         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2086         SortTuple       stup;
2087
2088         if (!tuplesort_gettuple_common(state, forward, &stup))
2089         {
2090                 MemoryContextSwitchTo(oldcontext);
2091                 return false;
2092         }
2093
2094         /* Record abbreviated key for caller */
2095         if (state->sortKeys->abbrev_converter && abbrev)
2096                 *abbrev = stup.datum1;
2097
2098         if (stup.isnull1 || !state->tuples)
2099         {
2100                 *val = stup.datum1;
2101                 *isNull = stup.isnull1;
2102         }
2103         else
2104         {
2105                 /* use stup.tuple because stup.datum1 may be an abbreviation */
2106                 *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2107                 *isNull = false;
2108         }
2109
2110         MemoryContextSwitchTo(oldcontext);
2111
2112         return true;
2113 }
2114
2115 /*
2116  * Advance over N tuples in either forward or back direction,
2117  * without returning any data.  N==0 is a no-op.
2118  * Returns TRUE if successful, FALSE if ran out of tuples.
2119  */
2120 bool
2121 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2122 {
2123         MemoryContext oldcontext;
2124
2125         /*
2126          * We don't actually support backwards skip yet, because no callers need
2127          * it.  The API is designed to allow for that later, though.
2128          */
2129         Assert(forward);
2130         Assert(ntuples >= 0);
2131
2132         switch (state->status)
2133         {
2134                 case TSS_SORTEDINMEM:
2135                         if (state->memtupcount - state->current >= ntuples)
2136                         {
2137                                 state->current += ntuples;
2138                                 return true;
2139                         }
2140                         state->current = state->memtupcount;
2141                         state->eof_reached = true;
2142
2143                         /*
2144                          * Complain if caller tries to retrieve more tuples than
2145                          * originally asked for in a bounded sort.  This is because
2146                          * returning EOF here might be the wrong thing.
2147                          */
2148                         if (state->bounded && state->current >= state->bound)
2149                                 elog(ERROR, "retrieved too many tuples in a bounded sort");
2150
2151                         return false;
2152
2153                 case TSS_SORTEDONTAPE:
2154                 case TSS_FINALMERGE:
2155
2156                         /*
2157                          * We could probably optimize these cases better, but for now it's
2158                          * not worth the trouble.
2159                          */
2160                         oldcontext = MemoryContextSwitchTo(state->sortcontext);
2161                         while (ntuples-- > 0)
2162                         {
2163                                 SortTuple       stup;
2164
2165                                 if (!tuplesort_gettuple_common(state, forward, &stup))
2166                                 {
2167                                         MemoryContextSwitchTo(oldcontext);
2168                                         return false;
2169                                 }
2170                                 CHECK_FOR_INTERRUPTS();
2171                         }
2172                         MemoryContextSwitchTo(oldcontext);
2173                         return true;
2174
2175                 default:
2176                         elog(ERROR, "invalid tuplesort state");
2177                         return false;           /* keep compiler quiet */
2178         }
2179 }
2180
2181 /*
2182  * tuplesort_merge_order - report merge order we'll use for given memory
2183  * (note: "merge order" just means the number of input tapes in the merge).
2184  *
2185  * This is exported for use by the planner.  allowedMem is in bytes.
2186  */
2187 int
2188 tuplesort_merge_order(int64 allowedMem)
2189 {
2190         int                     mOrder;
2191
2192         /*
2193          * We need one tape for each merge input, plus another one for the output,
2194          * and each of these tapes needs buffer space.  In addition we want
2195          * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't
2196          * count).
2197          *
2198          * Note: you might be thinking we need to account for the memtuples[]
2199          * array in this calculation, but we effectively treat that as part of the
2200          * MERGE_BUFFER_SIZE workspace.
2201          */
2202         mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
2203                 (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
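	/*
	 * Worked example with round, illustrative values (not the actual
	 * constants): if TAPE_BUFFER_OVERHEAD were 8kB and MERGE_BUFFER_SIZE
	 * were 256kB, then allowedMem = 64MB would give
	 * mOrder = (65536kB - 8kB) / (256kB + 8kB) = 248, before the
	 * MINORDER/MAXORDER clamps below are applied.
	 */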
2204
2205         /*
2206          * Even in minimum memory, use at least a MINORDER merge.  On the other
2207          * hand, even when we have lots of memory, do not use more than a MAXORDER
2208          * merge.  Tapes are pretty cheap, but they're not entirely free.  Each
2209          * additional tape reduces the amount of memory available to build runs,
2210          * which in turn can cause the same sort to need more runs, which makes
2211          * merging slower even if it can still be done in a single pass.  Also,
2212          * high order merges are quite slow due to CPU cache effects; it can be
2213          * faster to pay the I/O cost of a polyphase merge than to perform a
2214          * single merge pass across many hundreds of tapes.
2215          */
2216         mOrder = Max(mOrder, MINORDER);
2217         mOrder = Min(mOrder, MAXORDER);
2218
2219         return mOrder;
2220 }
2221
2222 /*
2223  * inittapes - initialize for tape sorting.
2224  *
2225  * This is called only if we have found we don't have room to sort in memory.
2226  */
2227 static void
2228 inittapes(Tuplesortstate *state)
2229 {
2230         int                     maxTapes,
2231                                 j;
2232         int64           tapeSpace;
2233
2234         /* Compute number of tapes to use: merge order plus 1 */
2235         maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
2236
2237         state->maxTapes = maxTapes;
2238         state->tapeRange = maxTapes - 1;
2239
2240 #ifdef TRACE_SORT
2241         if (trace_sort)
2242                 elog(LOG, "switching to external sort with %d tapes: %s",
2243                          maxTapes, pg_rusage_show(&state->ru_start));
2244 #endif
2245
2246         /*
2247          * Decrease availMem to reflect the space needed for tape buffers, when
2248          * writing the initial runs; but don't decrease it to the point that we
2249          * have no room for tuples.  (That case is only likely to occur if sorting
2250          * pass-by-value Datums; in all other scenarios the memtuples[] array is
2251          * unlikely to occupy more than half of allowedMem.  In the pass-by-value
2252          * case it's not important to account for tuple space, so we don't care if
2253          * LACKMEM becomes inaccurate.)
2254          */
2255         tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2256
2257         if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2258                 USEMEM(state, tapeSpace);
2259
2260         /*
2261          * Make sure that the temp file(s) underlying the tape set are created in
2262          * suitable temp tablespaces.
2263          */
2264         PrepareTempTablespaces();
2265
2266         /*
2267          * Create the tape set and allocate the per-tape data arrays.
2268          */
2269         state->tapeset = LogicalTapeSetCreate(maxTapes);
2270
2271         state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
2272         state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
2273         state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
2274         state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
2275         state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
2276
2277         state->currentRun = 0;
2278
2279         /*
2280          * Initialize variables of Algorithm D (step D1).
2281          */
2282         for (j = 0; j < maxTapes; j++)
2283         {
2284                 state->tp_fib[j] = 1;
2285                 state->tp_runs[j] = 0;
2286                 state->tp_dummy[j] = 1;
2287                 state->tp_tapenum[j] = j;
2288         }
2289         state->tp_fib[state->tapeRange] = 0;
2290         state->tp_dummy[state->tapeRange] = 0;
2291
2292         state->Level = 1;
2293         state->destTape = 0;
2294
2295         state->status = TSS_BUILDRUNS;
2296 }
2297
2298 /*
2299  * selectnewtape -- select new tape for new initial run.
2300  *
2301  * This is called after finishing a run when we know another run
2302  * must be started.  This implements steps D3, D4 of Algorithm D.
2303  */
2304 static void
2305 selectnewtape(Tuplesortstate *state)
2306 {
2307         int                     j;
2308         int                     a;
2309
2310         /* Step D3: advance j (destTape) */
2311         if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
2312         {
2313                 state->destTape++;
2314                 return;
2315         }
2316         if (state->tp_dummy[state->destTape] != 0)
2317         {
2318                 state->destTape = 0;
2319                 return;
2320         }
2321
2322         /* Step D4: increase level */
2323         state->Level++;
2324         a = state->tp_fib[0];
2325         for (j = 0; j < state->tapeRange; j++)
2326         {
2327                 state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
2328                 state->tp_fib[j] = a + state->tp_fib[j + 1];
2329         }
2330         state->destTape = 0;
2331 }
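/*
 * Worked example (illustrative): with three input tapes, the target run
 * counts in tp_fib produced by repeated applications of step D4 grow as
 * 1,1,1 -> 2,2,1 -> 4,3,2 -> 7,6,4 -> ..., the generalized-Fibonacci
 * distribution that lets each polyphase merge phase empty exactly one tape.
 */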
2332
2333 /*
2334  * Initialize the slab allocation arena, for the given number of slots.
2335  */
2336 static void
2337 init_slab_allocator(Tuplesortstate *state, int numSlots)
2338 {
2339         if (numSlots > 0)
2340         {
2341                 char       *p;
2342                 int                     i;
2343
2344                 state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2345                 state->slabMemoryEnd = state->slabMemoryBegin +
2346                         numSlots * SLAB_SLOT_SIZE;
2347                 state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2348                 USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2349
2350                 p = state->slabMemoryBegin;
2351                 for (i = 0; i < numSlots - 1; i++)
2352                 {
2353                         ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2354                         p += SLAB_SLOT_SIZE;
2355                 }
2356                 ((SlabSlot *) p)->nextfree = NULL;
2357         }
2358         else
2359         {
2360                 state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2361                 state->slabFreeHead = NULL;
2362         }
2363         state->slabAllocatorUsed = true;
2364 }
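/*
 * Minimal sketch of how slots in the arena built above can be popped from
 * and pushed back onto its singly linked free list.  The helper names are
 * illustrative only; the real consumers elsewhere in this file follow a
 * similar pattern but add fallbacks (e.g. ordinary palloc for oversized
 * tuples).  Kept under #if 0 so it is never compiled.
 */
#if 0
static void *
slab_slot_pop_sketch(Tuplesortstate *state)
{
	SlabSlot   *slot = state->slabFreeHead;

	if (slot == NULL)
		return NULL;			/* arena exhausted */
	state->slabFreeHead = slot->nextfree;
	return (void *) slot;
}

static void
slab_slot_push_sketch(Tuplesortstate *state, void *p)
{
	SlabSlot   *slot = (SlabSlot *) p;

	slot->nextfree = state->slabFreeHead;
	state->slabFreeHead = slot;
}
#endif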
2365
2366 /*
2367  * mergeruns -- merge all the completed initial runs.
2368  *
2369  * This implements steps D5, D6 of Algorithm D.  All input data has
2370  * already been written to initial runs on tape (see dumptuples).
2371  */
2372 static void
2373 mergeruns(Tuplesortstate *state)
2374 {
2375         int                     tapenum,
2376                                 svTape,
2377                                 svRuns,
2378                                 svDummy;
2379         int                     numTapes;
2380         int                     numInputTapes;
2381
2382         Assert(state->status == TSS_BUILDRUNS);
2383         Assert(state->memtupcount == 0);
2384
2385         if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2386         {
2387                 /*
2388                  * If there are multiple runs to be merged, when we go to read back
2389                  * tuples from disk, abbreviated keys will not have been stored, and
2390                  * we don't care to regenerate them.  Disable abbreviation from this
2391                  * point on.
2392                  */
2393                 state->sortKeys->abbrev_converter = NULL;
2394                 state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2395
2396                 /* Not strictly necessary, but be tidy */
2397                 state->sortKeys->abbrev_abort = NULL;
2398                 state->sortKeys->abbrev_full_comparator = NULL;
2399         }
2400
2401         /*
2402          * Reset tuple memory.  We've freed all the tuples that we previously
2403          * allocated.  We will use the slab allocator from now on.
2404          */
2405         MemoryContextDelete(state->tuplecontext);
2406         state->tuplecontext = NULL;
2407
2408         /*
2409          * We no longer need a large memtuples array.  (We will allocate a smaller
2410          * one for the heap later.)
2411          */
2412         FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2413         pfree(state->memtuples);
2414         state->memtuples = NULL;
2415
2416         /*
2417          * If we had fewer runs than tapes, refund the memory that we imagined we
2418          * would need for the tape buffers of the unused tapes.
2419          *
2420          * numTapes and numInputTapes reflect the actual number of tapes we will
2421          * use.  Note that the output tape's tape number is maxTapes - 1, so the
2422          * tape numbers of the used tapes are not consecutive, and you cannot just
2423          * loop from 0 to numTapes to visit all used tapes!
2424          */
2425         if (state->Level == 1)
2426         {
2427                 numInputTapes = state->currentRun;
2428                 numTapes = numInputTapes + 1;
2429                 FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
2430         }
2431         else
2432         {
2433                 numInputTapes = state->tapeRange;
2434                 numTapes = state->maxTapes;
2435         }
2436
2437         /*
2438          * Initialize the slab allocator.  We need one slab slot per input tape,
2439          * for the tuples in the heap, plus one to hold the tuple last returned
2440          * from tuplesort_gettuple.  (If we're sorting pass-by-val Datums,
2441          * however, we don't need to allocate anything.)
2442          *
2443          * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2444          * to track memory usage of individual tuples.
2445          */
2446         if (state->tuples)
2447                 init_slab_allocator(state, numInputTapes + 1);
2448         else
2449                 init_slab_allocator(state, 0);
2450
2451         /*
2452          * Allocate a new 'memtuples' array, for the heap.  It will hold one tuple
2453          * from each input tape.
2454          */
2455         state->memtupsize = numInputTapes;
2456         state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
2457         USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2458
2459         /*
2460          * Use all the remaining memory we have available for read buffers among
2461          * the input tapes.
2462          *
2463          * We do this only after checking for the case that we produced only one
2464          * initial run, because there is no need to use a large read buffer when
2465          * we're reading from a single tape.  With one tape, the I/O pattern will
2466          * be the same regardless of the buffer size.
2467          *
2468          * We don't try to "rebalance" the memory among tapes, when we start a new
2469          * merge phase, even if some tapes are inactive in the new phase.  That
2470          * would be hard, because logtape.c doesn't know where one run ends and
2471          * another begins.  When a new merge phase begins, and a tape doesn't
2472          * participate in it, its buffer nevertheless already contains tuples from
2473          * the next run on the same tape, so we cannot release the buffer.  That's
2474          * OK in practice; merge performance isn't that sensitive to the amount of
2475          * buffer memory used, and most merge phases use all or almost all tapes
2476          * anyway.
2477          */
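
        /*
         * For example (illustrative numbers only): with 8 input tapes and
         * roughly 32 MB of availMem left at this point, each tape gets a
         * read buffer of about 32 MB / 8 = 4 MB below.
         */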
2478 #ifdef TRACE_SORT
2479         if (trace_sort)
2480                 elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
2481                          (state->availMem) / 1024, numInputTapes);
2482 #endif
2483
2484         state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
2485         USEMEM(state, state->read_buffer_size * numInputTapes);
2486
2487         /* End of step D2: rewind all output tapes to prepare for merging */
2488         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2489                 LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
2490
2491         for (;;)
2492         {
2493                 /*
2494                  * At this point we know that tape[T] is empty.  If there's just one
2495                  * (real or dummy) run left on each input tape, then only one merge
2496                  * pass remains.  If we don't have to produce a materialized sorted
2497                  * tape, we can stop at this point and do the final merge on-the-fly.
2498                  */
2499                 if (!state->randomAccess)
2500                 {
2501                         bool            allOneRun = true;
2502
2503                         Assert(state->tp_runs[state->tapeRange] == 0);
2504                         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2505                         {
2506                                 if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
2507                                 {
2508                                         allOneRun = false;
2509                                         break;
2510                                 }
2511                         }
2512                         if (allOneRun)
2513                         {
2514                                 /* Tell logtape.c we won't be writing anymore */
2515                                 LogicalTapeSetForgetFreeSpace(state->tapeset);
2516                                 /* Initialize for the final merge pass */
2517                                 beginmerge(state);
2518                                 state->status = TSS_FINALMERGE;
2519                                 return;
2520                         }
2521                 }
2522
2523                 /* Step D5: merge runs onto tape[T] until tape[P] is empty */
2524                 while (state->tp_runs[state->tapeRange - 1] ||
2525                            state->tp_dummy[state->tapeRange - 1])
2526                 {
2527                         bool            allDummy = true;
2528
2529                         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2530                         {
2531                                 if (state->tp_dummy[tapenum] == 0)
2532                                 {
2533                                         allDummy = false;
2534                                         break;
2535                                 }
2536                         }
2537
2538                         if (allDummy)
2539                         {
2540                                 state->tp_dummy[state->tapeRange]++;
2541                                 for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2542                                         state->tp_dummy[tapenum]--;
2543                         }
2544                         else
2545                                 mergeonerun(state);
2546                 }
2547
2548                 /* Step D6: decrease level */
2549                 if (--state->Level == 0)
2550                         break;
2551                 /* rewind output tape T to use as new input */
2552                 LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange],
2553                                                                  state->read_buffer_size);
2554                 /* rewind used-up input tape P, and prepare it for write pass */
2555                 LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]);
2556                 state->tp_runs[state->tapeRange - 1] = 0;
2557
2558                 /*
2559                  * reassign tape units per step D6; note we no longer care about A[]
2560                  */
2561                 svTape = state->tp_tapenum[state->tapeRange];
2562                 svDummy = state->tp_dummy[state->tapeRange];
2563                 svRuns = state->tp_runs[state->tapeRange];
2564                 for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
2565                 {
2566                         state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
2567                         state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
2568                         state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
2569                 }
2570                 state->tp_tapenum[0] = svTape;
2571                 state->tp_dummy[0] = svDummy;
2572                 state->tp_runs[0] = svRuns;
2573         }
2574
2575         /*
2576          * Done.  Knuth says that the result is on TAPE[1], but since we exited
2577          * the loop without performing the last iteration of step D6, we have not
2578          * rearranged the tape unit assignment, and therefore the result is on
2579          * TAPE[T].  We need to do it this way so that we can freeze the final
2580          * output tape while rewinding it.  The last iteration of step D6 would be
2581          * a waste of cycles anyway...
2582          */
2583         state->result_tape = state->tp_tapenum[state->tapeRange];
2584         LogicalTapeFreeze(state->tapeset, state->result_tape);
2585         state->status = TSS_SORTEDONTAPE;
2586
2587         /* Release the read buffers of all the other tapes, by rewinding them. */
2588         for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
2589         {
2590                 if (tapenum != state->result_tape)
2591                         LogicalTapeRewindForWrite(state->tapeset, tapenum);
2592         }
2593 }
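
/*
 * Worked example of the loop above (hedged; the exact counts depend on the
 * run distribution chosen by selectnewtape): suppose tapeRange = 3 and five
 * initial runs were distributed 2,2,1 across the input tapes.  Step D5
 * performs one 3-way merge, emptying tape P and leaving one run on each of
 * the other two inputs plus the new run on tape T.  Step D6 then rotates
 * the tape assignments, and the next pass finds one run per input tape,
 * which is the final merge (done on-the-fly unless randomAccess requires a
 * materialized result tape).
 */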
2594
2595 /*
2596  * Merge one run from each input tape, except ones with dummy runs.
2597  *
2598  * This is the inner loop of Algorithm D step D5.  We know that the
2599  * output tape is TAPE[T].
2600  */
2601 static void
2602 mergeonerun(Tuplesortstate *state)
2603 {
2604         int                     destTape = state->tp_tapenum[state->tapeRange];
2605         int                     srcTape;
2606
2607         /*
2608          * Start the merge by loading one tuple from each active source tape into
2609          * the heap.  We can also decrease the input run/dummy run counts.
2610          */
2611         beginmerge(state);
2612
2613         /*
2614          * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2615          * out, and replacing it with next tuple from same tape (if there is
2616          * another one).
2617          */
2618         while (state->memtupcount > 0)
2619         {
2620                 SortTuple       stup;
2621
2622                 /* write the tuple to destTape */
2623                 srcTape = state->memtuples[0].tupindex;
2624                 WRITETUP(state, destTape, &state->memtuples[0]);
2625
2626                 /* recycle the slot of the tuple we just wrote out, for the next read */
2627                 if (state->memtuples[0].tuple)
2628                         RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
2629
2630                 /*
2631                  * pull next tuple from the tape, and replace the written-out tuple in
2632                  * the heap with it.
2633                  */
2634                 if (mergereadnext(state, srcTape, &stup))
2635                 {
2636                         stup.tupindex = srcTape;
2637                         tuplesort_heap_replace_top(state, &stup);
2639                 }
2640                 else
2641                         tuplesort_heap_delete_top(state);
2642         }
2643
2644         /*
2645          * When the heap empties, we're done.  Write an end-of-run marker on the
2646          * output tape, and increment its count of real runs.
2647          */
2648         markrunend(state, destTape);
2649         state->tp_runs[state->tapeRange]++;
2650
2651 #ifdef TRACE_SORT
2652         if (trace_sort)
2653                 elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
2654                          pg_rusage_show(&state->ru_start));
2655 #endif
2656 }
2657
2658 /*
2659  * beginmerge - initialize for a merge pass
2660  *
2661  * We decrease the counts of real and dummy runs for each tape, and mark
2662  * which tapes contain active input runs in mergeactive[].  Then, fill the
2663  * merge heap with the first tuple from each active tape.
2664  */
2665 static void
2666 beginmerge(Tuplesortstate *state)
2667 {
2668         int                     activeTapes;
2669         int                     tapenum;
2670         int                     srcTape;
2671
2672         /* Heap should be empty here */
2673         Assert(state->memtupcount == 0);
2674
2675         /* Adjust run counts and mark the active tapes */
2676         memset(state->mergeactive, 0,
2677                    state->maxTapes * sizeof(*state->mergeactive));
2678         activeTapes = 0;
2679         for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2680         {
2681                 if (state->tp_dummy[tapenum] > 0)
2682                         state->tp_dummy[tapenum]--;
2683                 else
2684                 {
2685                         Assert(state->tp_runs[tapenum] > 0);
2686                         state->tp_runs[tapenum]--;
2687                         srcTape = state->tp_tapenum[tapenum];
2688                         state->mergeactive[srcTape] = true;
2689                         activeTapes++;
2690                 }
2691         }
2692         Assert(activeTapes > 0);
2693         state->activeTapes = activeTapes;
2694
2695         /* Load the merge heap with the first tuple from each input tape */
2696         for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2697         {
2698                 SortTuple       tup;
2699
2700                 if (mergereadnext(state, srcTape, &tup))
2701                 {
2702                         tup.tupindex = srcTape;
2703                         tuplesort_heap_insert(state, &tup);
2704                 }
2705         }
2706 }
2707
2708 /*
2709  * mergereadnext - read next tuple from one merge input tape
2710  *
2711  * Returns false on EOF.
2712  */
2713 static bool
2714 mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
2715 {
2716         unsigned int tuplen;
2717
2718         if (!state->mergeactive[srcTape])
2719                 return false;                   /* tape's run is already exhausted */
2720
2721         /* read next tuple, if any */
2722         if ((tuplen = getlen(state, srcTape, true)) == 0)
2723         {
2724                 state->mergeactive[srcTape] = false;
2725                 return false;
2726         }
2727         READTUP(state, stup, srcTape, tuplen);
2728
2729         return true;
2730 }
2731
2732 /*
2733  * dumptuples - remove tuples from memtuples and write initial run to tape
2734  *
2735  * When alltuples = true, dump everything currently in memory.  (This case is
2736  * only used at end of input data.)
2737  */
2738 static void
2739 dumptuples(Tuplesortstate *state, bool alltuples)
2740 {
2741         int                     memtupwrite;
2742         int                     i;
2743
2744         /*
2745          * Nothing to do if we still fit in available memory and have array
2746          * slots, unless this is the final call during initial run generation.
2747          */
2748         if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
2749                 !alltuples)
2750                 return;
2751
2752         /*
2753          * Final call might require no sorting, in rare cases where we just so
2754          * happen to have previously LACKMEM()'d at the point where exactly all
2755          * remaining tuples are loaded into memory, just before input was
2756          * exhausted.
2757          *
2758          * In general, short final runs are quite possible.  Rather than allowing
2759          * a special case where there was a superfluous selectnewtape() call (i.e.
2760          * a call with no subsequent run actually written to destTape), we prefer
2761          * to write out a 0 tuple run.
2762          *
2763          * mergereadnext() is prepared for 0 tuple runs, and will reliably mark
2764          * the tape inactive for the merge when called from beginmerge().  This
2765          * case is therefore similar to the case where mergeonerun() finds a dummy
2766          * run for the tape, and so doesn't need to merge a run from the tape (or
2767          * conceptually "merges" the dummy run, if you prefer).  According to
2768          * Knuth, Algorithm D "isn't strictly optimal" in its method of
2769          * distribution and dummy run assignment; this edge case seems very
2770          * unlikely to make that appreciably worse.
2771          */
2772         Assert(state->status == TSS_BUILDRUNS);
2773
2774         /*
2775          * It seems unlikely that this limit will ever be exceeded, but take no
2776          * chances
2777          */
2778         if (state->currentRun == INT_MAX)
2779                 ereport(ERROR,
2780                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2781                                  errmsg("cannot have more than %d runs for an external sort",
2782                                                 INT_MAX)));
2783
2784         state->currentRun++;
2785
2786 #ifdef TRACE_SORT
2787         if (trace_sort)
2788                 elog(LOG, "starting quicksort of run %d: %s",
2789                          state->currentRun, pg_rusage_show(&state->ru_start));
2790 #endif
2791
2792         /*
2793          * Sort all tuples accumulated within the allowed amount of memory for
2794          * this run using quicksort
2795          */
2796         tuplesort_sort_memtuples(state);
2797
2798 #ifdef TRACE_SORT
2799         if (trace_sort)
2800                 elog(LOG, "finished quicksort of run %d: %s",
2801                          state->currentRun, pg_rusage_show(&state->ru_start));
2802 #endif
2803
2804         memtupwrite = state->memtupcount;
2805         for (i = 0; i < memtupwrite; i++)
2806         {
2807                 WRITETUP(state, state->tp_tapenum[state->destTape],
2808                                  &state->memtuples[i]);
2809                 state->memtupcount--;
2810         }
2811
2812         /*
2813          * Reset tuple memory.  We've freed all of the tuples that we previously
2814          * allocated.  It's important to avoid fragmentation when there is a stark
2815          * change in the sizes of incoming tuples.  Fragmentation due to
2816          * AllocSetFree's bucketing by size class might be particularly bad if
2817          * this step wasn't taken.
2818          */
2819         MemoryContextReset(state->tuplecontext);
2820
2821         markrunend(state, state->tp_tapenum[state->destTape]);
2822         state->tp_runs[state->destTape]++;
2823         state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
2824
2825 #ifdef TRACE_SORT
2826         if (trace_sort)
2827                 elog(LOG, "finished writing run %d to tape %d: %s",
2828                          state->currentRun, state->destTape,
2829                          pg_rusage_show(&state->ru_start));
2830 #endif
2831
2832         if (!alltuples)
2833                 selectnewtape(state);
2834 }
2835
2836 /*
2837  * tuplesort_rescan             - rewind and replay the scan
2838  */
2839 void
2840 tuplesort_rescan(Tuplesortstate *state)
2841 {
2842         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2843
2844         Assert(state->randomAccess);
2845
2846         switch (state->status)
2847         {
2848                 case TSS_SORTEDINMEM:
2849                         state->current = 0;
2850                         state->eof_reached = false;
2851                         state->markpos_offset = 0;
2852                         state->markpos_eof = false;
2853                         break;
2854                 case TSS_SORTEDONTAPE:
2855                         LogicalTapeRewindForRead(state->tapeset,
2856                                                                          state->result_tape,
2857                                                                          0);
2858                         state->eof_reached = false;
2859                         state->markpos_block = 0L;
2860                         state->markpos_offset = 0;
2861                         state->markpos_eof = false;
2862                         break;
2863                 default:
2864                         elog(ERROR, "invalid tuplesort state");
2865                         break;
2866         }
2867
2868         MemoryContextSwitchTo(oldcontext);
2869 }
2870
2871 /*
2872  * tuplesort_markpos    - saves current position in the merged sort file
2873  */
2874 void
2875 tuplesort_markpos(Tuplesortstate *state)
2876 {
2877         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2878
2879         Assert(state->randomAccess);
2880
2881         switch (state->status)
2882         {
2883                 case TSS_SORTEDINMEM:
2884                         state->markpos_offset = state->current;
2885                         state->markpos_eof = state->eof_reached;
2886                         break;
2887                 case TSS_SORTEDONTAPE:
2888                         LogicalTapeTell(state->tapeset,
2889                                                         state->result_tape,
2890                                                         &state->markpos_block,
2891                                                         &state->markpos_offset);
2892                         state->markpos_eof = state->eof_reached;
2893                         break;
2894                 default:
2895                         elog(ERROR, "invalid tuplesort state");
2896                         break;
2897         }
2898
2899         MemoryContextSwitchTo(oldcontext);
2900 }
2901
2902 /*
2903  * tuplesort_restorepos - restores current position in merged sort file to
2904  *                                                last saved position
2905  */
2906 void
2907 tuplesort_restorepos(Tuplesortstate *state)
2908 {
2909         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2910
2911         Assert(state->randomAccess);
2912
2913         switch (state->status)
2914         {
2915                 case TSS_SORTEDINMEM:
2916                         state->current = state->markpos_offset;
2917                         state->eof_reached = state->markpos_eof;
2918                         break;
2919                 case TSS_SORTEDONTAPE:
2920                         LogicalTapeSeek(state->tapeset,
2921                                                         state->result_tape,
2922                                                         state->markpos_block,
2923                                                         state->markpos_offset);
2924                         state->eof_reached = state->markpos_eof;
2925                         break;
2926                 default:
2927                         elog(ERROR, "invalid tuplesort state");
2928                         break;
2929         }
2930
2931         MemoryContextSwitchTo(oldcontext);
2932 }
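
/*
 * Typical caller pattern for the mark/restore support above, e.g. the inner
 * side of a merge join (hedged sketch; tuplesort_gettupleslot() is the
 * fetch routine declared in tuplesort.h):
 *
 *      tuplesort_markpos(sortstate);
 *      while (tuplesort_gettupleslot(sortstate, ...))
 *          ... process tuples ...
 *      tuplesort_restorepos(sortstate);    (rescan the same group)
 */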
2933
2934 /*
2935  * tuplesort_get_stats - extract summary statistics
2936  *
2937  * This can be called after tuplesort_performsort() finishes to obtain
2938  * printable summary information about how the sort was performed.
2939  */
2940 void
2941 tuplesort_get_stats(Tuplesortstate *state,
2942                                         TuplesortInstrumentation *stats)
2943 {
2944         /*
2945          * Note: it might seem we should provide both memory and disk usage for a
2946          * disk-based sort.  However, the current code doesn't track memory space
2947          * accurately once we have begun to return tuples to the caller (since we
2948          * don't account for pfree's the caller is expected to do), so we cannot
2949          * rely on availMem in a disk sort.  This does not seem worth the overhead
2950          * to fix.  Is it worth creating an API for the memory context code to
2951          * tell us how much is actually used in sortcontext?
2952          */
2953         if (state->tapeset)
2954         {
2955                 stats->spaceType = SORT_SPACE_TYPE_DISK;
2956                 stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
2957         }
2958         else
2959         {
2960                 stats->spaceType = SORT_SPACE_TYPE_MEMORY;
2961                 stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
2962         }
2963
2964         switch (state->status)
2965         {
2966                 case TSS_SORTEDINMEM:
2967                         if (state->boundUsed)
2968                                 stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT;
2969                         else
2970                                 stats->sortMethod = SORT_TYPE_QUICKSORT;
2971                         break;
2972                 case TSS_SORTEDONTAPE:
2973                         stats->sortMethod = SORT_TYPE_EXTERNAL_SORT;
2974                         break;
2975                 case TSS_FINALMERGE:
2976                         stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE;
2977                         break;
2978                 default:
2979                         stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS;
2980                         break;
2981         }
2982 }
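
/*
 * For example, with the default BLCKSZ of 8192 a tape set spanning 1000
 * blocks is reported as 1000 * 8 = 8000 kB of disk space, while an
 * in-memory sort that consumed 10000 bytes of its allowance is reported as
 * (10000 + 1023) / 1024 = 10 kB, i.e. rounded up to whole kilobytes.
 */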
2983
2984 /*
2985  * Convert TuplesortMethod to a string.
2986  */
2987 const char *
2988 tuplesort_method_name(TuplesortMethod m)
2989 {
2990         switch (m)
2991         {
2992                 case SORT_TYPE_STILL_IN_PROGRESS:
2993                         return "still in progress";
2994                 case SORT_TYPE_TOP_N_HEAPSORT:
2995                         return "top-N heapsort";
2996                 case SORT_TYPE_QUICKSORT:
2997                         return "quicksort";
2998                 case SORT_TYPE_EXTERNAL_SORT:
2999                         return "external sort";
3000                 case SORT_TYPE_EXTERNAL_MERGE:
3001                         return "external merge";
3002         }
3003
3004         return "unknown";
3005 }
3006
3007 /*
3008  * Convert TuplesortSpaceType to a string.
3009  */
3010 const char *
3011 tuplesort_space_type_name(TuplesortSpaceType t)
3012 {
3013         Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY);
3014         return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
3015 }
3016
3017
3018 /*
3019  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3020  */
3021
3022 /*
3023  * Convert the existing unordered array of SortTuples to a bounded heap,
3024  * discarding all but the smallest "state->bound" tuples.
3025  *
3026  * When working with a bounded heap, we want to keep the largest entry
3027  * at the root (array entry zero), instead of the smallest as in the normal
3028  * sort case.  This allows us to discard the largest entry cheaply.
3029  * Therefore, we temporarily reverse the sort direction.
3030  */
3031 static void
3032 make_bounded_heap(Tuplesortstate *state)
3033 {
3034         int                     tupcount = state->memtupcount;
3035         int                     i;
3036
3037         Assert(state->status == TSS_INITIAL);
3038         Assert(state->bounded);
3039         Assert(tupcount >= state->bound);
3040
3041         /* Reverse sort direction so largest entry will be at root */
3042         reversedirection(state);
3043
3044         state->memtupcount = 0;         /* make the heap empty */
3045         for (i = 0; i < tupcount; i++)
3046         {
3047                 if (state->memtupcount < state->bound)
3048                 {
3049                         /* Insert next tuple into heap */
3050                         /* Must copy source tuple to avoid possible overwrite */
3051                         SortTuple       stup = state->memtuples[i];
3052
3053                         tuplesort_heap_insert(state, &stup);
3054                 }
3055                 else
3056                 {
3057                         /*
3058                          * The heap is full.  Replace the largest entry with the new
3059                          * tuple, or just discard it, if it's larger than anything already
3060                          * in the heap.
3061                          */
3062                         if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3063                         {
3064                                 free_sort_tuple(state, &state->memtuples[i]);
3065                                 CHECK_FOR_INTERRUPTS();
3066                         }
3067                         else
3068                                 tuplesort_heap_replace_top(state, &state->memtuples[i]);
3069                 }
3070         }
3071
3072         Assert(state->memtupcount == state->bound);
3073         state->status = TSS_BOUNDED;
3074 }
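
/*
 * Minimal standalone sketch of the bounded-heap technique above, using
 * plain ints: keep only the 'bound' smallest values seen so far, with the
 * largest of them at the root so it can be discarded cheaply.  The helper
 * name is hypothetical; tuplesort itself does this with SortTuples and a
 * temporarily reversed sort direction.
 */
static void
bounded_heap_keep_smallest(int *heap, int *heapcount, int bound, int value)
{
    unsigned int i,
                j;

    if (*heapcount < bound)
    {
        /* Heap not yet full: sift the new value up from the bottom. */
        j = (*heapcount)++;
        while (j > 0)
        {
            unsigned int parent = (j - 1) >> 1;

            if (value <= heap[parent])
                break;
            heap[j] = heap[parent];
            j = parent;
        }
        heap[j] = value;
        return;
    }

    /* Heap full: anything not smaller than the current maximum is dropped. */
    if (value >= heap[0])
        return;

    /* Otherwise replace the root and sift the new value down. */
    i = 0;
    for (;;)
    {
        j = 2 * i + 1;
        if (j >= (unsigned int) bound)
            break;
        if (j + 1 < (unsigned int) bound && heap[j + 1] > heap[j])
            j++;
        if (value >= heap[j])
            break;
        heap[i] = heap[j];
        i = j;
    }
    heap[i] = value;
}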
3075
3076 /*
3077  * Convert the bounded heap to a properly-sorted array
3078  */
3079 static void
3080 sort_bounded_heap(Tuplesortstate *state)
3081 {
3082         int                     tupcount = state->memtupcount;
3083
3084         Assert(state->status == TSS_BOUNDED);
3085         Assert(state->bounded);
3086         Assert(tupcount == state->bound);
3087
3088         /*
3089          * We can unheapify in place because each delete-top call will remove the
3090          * largest entry, which we can promptly store in the newly freed slot at
3091          * the end.  Once we're down to a single-entry heap, we're done.
3092          */
3093         while (state->memtupcount > 1)
3094         {
3095                 SortTuple       stup = state->memtuples[0];
3096
3097                 /* this sifts-up the next-largest entry and decreases memtupcount */
3098                 tuplesort_heap_delete_top(state);
3099                 state->memtuples[state->memtupcount] = stup;
3100         }
3101         state->memtupcount = tupcount;
3102
3103         /*
3104          * Reverse sort direction back to the original state.  This is not
3105          * actually necessary but seems like a good idea for tidiness.
3106          */
3107         reversedirection(state);
3108
3109         state->status = TSS_SORTEDINMEM;
3110         state->boundUsed = true;
3111 }
3112
3113 /*
3114  * Sort all memtuples using specialized qsort() routines.
3115  *
3116  * Quicksort is used for small in-memory sorts, and external sort runs.
3117  */
3118 static void
3119 tuplesort_sort_memtuples(Tuplesortstate *state)
3120 {
3121         if (state->memtupcount > 1)
3122         {
3123                 /* Can we use the single-key sort function? */
3124                 if (state->onlyKey != NULL)
3125                         qsort_ssup(state->memtuples, state->memtupcount,
3126                                            state->onlyKey);
3127                 else
3128                         qsort_tuple(state->memtuples,
3129                                                 state->memtupcount,
3130                                                 state->comparetup,
3131                                                 state);
3132         }
3133 }
3134
3135 /*
3136  * Insert a new tuple into an empty or existing heap, maintaining the
3137  * heap invariant.  Caller is responsible for ensuring there's room.
3138  *
3139  * Note: For some callers, tuple points to a memtuples[] entry above the
3140  * end of the heap.  This is safe as long as it's not immediately adjacent
3141  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3142  * is, it might get overwritten before being moved into the heap!
3143  */
3144 static void
3145 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
3146 {
3147         SortTuple  *memtuples;
3148         int                     j;
3149
3150         memtuples = state->memtuples;
3151         Assert(state->memtupcount < state->memtupsize);
3152
3153         CHECK_FOR_INTERRUPTS();
3154
3155         /*
3156          * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3157          * using 1-based array indexes, not 0-based.
3158          */
3159         j = state->memtupcount++;
3160         while (j > 0)
3161         {
3162                 int                     i = (j - 1) >> 1;
3163
3164                 if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
3165                         break;
3166                 memtuples[j] = memtuples[i];
3167                 j = i;
3168         }
3169         memtuples[j] = *tuple;
3170 }
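
/*
 * For example, assuming a comparator that puts smaller values first,
 * inserting 4 into the array heap {3, 8, 5}: the hole starts at index 3,
 * whose parent is index 1 holding 8; 4 sorts before 8, so 8 moves down and
 * the hole moves to index 1; its parent is index 0 holding 3, which sorts
 * before 4, so we stop and store 4 at index 1, giving {3, 4, 5, 8}.
 */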
3171
3172 /*
3173  * Remove the tuple at state->memtuples[0] from the heap.  Decrement
3174  * memtupcount, and sift up to maintain the heap invariant.
3175  *
3176  * The caller has already free'd the tuple the top node points to,
3177  * if necessary.
3178  */
3179 static void
3180 tuplesort_heap_delete_top(Tuplesortstate *state)
3181 {
3182         SortTuple  *memtuples = state->memtuples;
3183         SortTuple  *tuple;
3184
3185         if (--state->memtupcount <= 0)
3186                 return;
3187
3188         /*
3189          * Remove the last tuple in the heap, and re-insert it, by replacing the
3190          * current top node with it.
3191          */
3192         tuple = &memtuples[state->memtupcount];
3193         tuplesort_heap_replace_top(state, tuple);
3194 }
3195
3196 /*
3197  * Replace the tuple at state->memtuples[0] with a new tuple.  Sift up to
3198  * maintain the heap invariant.
3199  *
3200  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
3201  * Heapsort, steps H3-H8).
3202  */
3203 static void
3204 tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
3205 {
3206         SortTuple  *memtuples = state->memtuples;
3207         unsigned int i,
3208                                 n;
3209
3210         Assert(state->memtupcount >= 1);
3211
3212         CHECK_FOR_INTERRUPTS();
3213
3214         /*
3215          * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3216          * This prevents overflow in the "2 * i + 1" calculation, since at the top
3217          * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3218          */
3219         n = state->memtupcount;
3220         i = 0;                                          /* i is where the "hole" is */
3221         for (;;)
3222         {
3223                 unsigned int j = 2 * i + 1;
3224
3225                 if (j >= n)
3226                         break;
3227                 if (j + 1 < n &&
3228                         COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
3229                         j++;
3230                 if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
3231                         break;
3232                 memtuples[i] = memtuples[j];
3233                 i = j;
3234         }
3235         memtuples[i] = *tuple;
3236 }
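
/*
 * Continuing the worked example given after tuplesort_heap_insert():
 * replacing the root of {3, 4, 5, 8} with 9, the smaller child of the root
 * is 4, which sorts before 9, so 4 moves up and the hole descends to index
 * 1; its only child is 8 at index 3, which also sorts before 9, so 8 moves
 * up and 9 lands at index 3, giving {4, 8, 5, 9}.
 */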
3237
3238 /*
3239  * Function to reverse the sort direction from its current state
3240  *
3241  * It is not safe to call this when performing hash tuplesorts
3242  */
3243 static void
3244 reversedirection(Tuplesortstate *state)
3245 {
3246         SortSupport sortKey = state->sortKeys;
3247         int                     nkey;
3248
3249         for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3250         {
3251                 sortKey->ssup_reverse = !sortKey->ssup_reverse;
3252                 sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3253         }
3254 }
3255
3256
3257 /*
3258  * Tape interface routines
3259  */
3260
3261 static unsigned int
3262 getlen(Tuplesortstate *state, int tapenum, bool eofOK)
3263 {
3264         unsigned int len;
3265
3266         if (LogicalTapeRead(state->tapeset, tapenum,
3267                                                 &len, sizeof(len)) != sizeof(len))
3268                 elog(ERROR, "unexpected end of tape");
3269         if (len == 0 && !eofOK)
3270                 elog(ERROR, "unexpected end of data");
3271         return len;
3272 }
3273
3274 static void
3275 markrunend(Tuplesortstate *state, int tapenum)
3276 {
3277         unsigned int len = 0;
3278
3279         LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
3280 }
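
/*
 * Implied on-tape layout of a run, per the two routines above and the
 * writetup/readtup routines below: each tuple is a length word (which
 * counts itself) followed by the tuple body, with the length repeated
 * after the body when randomAccess needs backward scans, and the run is
 * terminated by a zero length word:
 *
 *      [len1][tuple1 body]([len1]) [len2][tuple2 body]([len2]) ... [0]
 *
 * getlen() returning zero with eofOK = true is how mergereadnext() and
 * friends detect the end of a run written by markrunend().
 */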
3281
3282 /*
3283  * Get memory for tuple from within READTUP() routine.
3284  *
3285  * We use next free slot from the slab allocator, or palloc() if the tuple
3286  * is too large for that.
3287  */
3288 static void *
3289 readtup_alloc(Tuplesortstate *state, Size tuplen)
3290 {
3291         SlabSlot   *buf;
3292
3293         /*
3294          * We pre-allocate enough slots in the slab arena that we should never run
3295          * out.
3296          */
3297         Assert(state->slabFreeHead);
3298
3299         if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
3300                 return MemoryContextAlloc(state->sortcontext, tuplen);
3301         else
3302         {
3303                 buf = state->slabFreeHead;
3304                 /* Reuse this slot */
3305                 state->slabFreeHead = buf->nextfree;
3306
3307                 return buf;
3308         }
3309 }
3310
3311
3312 /*
3313  * Routines specialized for HeapTuple (actually MinimalTuple) case
3314  */
3315
3316 static int
3317 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3318 {
3319         SortSupport sortKey = state->sortKeys;
3320         HeapTupleData ltup;
3321         HeapTupleData rtup;
3322         TupleDesc       tupDesc;
3323         int                     nkey;
3324         int32           compare;
3325         AttrNumber      attno;
3326         Datum           datum1,
3327                                 datum2;
3328         bool            isnull1,
3329                                 isnull2;
3330
3331
3332         /* Compare the leading sort key */
3333         compare = ApplySortComparator(a->datum1, a->isnull1,
3334                                                                   b->datum1, b->isnull1,
3335                                                                   sortKey);
3336         if (compare != 0)
3337                 return compare;
3338
3339         /* Compare additional sort keys */
3340         ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3341         ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3342         rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3343         rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3344         tupDesc = state->tupDesc;
3345
3346         if (sortKey->abbrev_converter)
3347         {
3348                 attno = sortKey->ssup_attno;
3349
3350                 datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3351                 datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3352
3353                 compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3354                                                                                                 datum2, isnull2,
3355                                                                                                 sortKey);
3356                 if (compare != 0)
3357                         return compare;
3358         }
3359
3360         sortKey++;
3361         for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3362         {
3363                 attno = sortKey->ssup_attno;
3364
3365                 datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3366                 datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3367
3368                 compare = ApplySortComparator(datum1, isnull1,
3369                                                                           datum2, isnull2,
3370                                                                           sortKey);
3371                 if (compare != 0)
3372                         return compare;
3373         }
3374
3375         return 0;
3376 }
3377
3378 static void
3379 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3380 {
3381         /*
3382          * We expect the passed "tup" to be a TupleTableSlot, and form a
3383          * MinimalTuple using the exported interface for that.
3384          */
3385         TupleTableSlot *slot = (TupleTableSlot *) tup;
3386         Datum           original;
3387         MinimalTuple tuple;
3388         HeapTupleData htup;
3389         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3390
3391         /* copy the tuple into sort storage */
3392         tuple = ExecCopySlotMinimalTuple(slot);
3393         stup->tuple = (void *) tuple;
3394         USEMEM(state, GetMemoryChunkSpace(tuple));
3395         /* set up first-column key value */
3396         htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3397         htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3398         original = heap_getattr(&htup,
3399                                                         state->sortKeys[0].ssup_attno,
3400                                                         state->tupDesc,
3401                                                         &stup->isnull1);
3402
3403         MemoryContextSwitchTo(oldcontext);
3404
3405         if (!state->sortKeys->abbrev_converter || stup->isnull1)
3406         {
3407                 /*
3408                  * Store the ordinary Datum representation, or the NULL value.  If
3409                  * there is a converter it won't expect NULL values, and the cost
3410                  * model is not required to account for NULL, so in that case we
3411                  * avoid calling the converter and just set datum1 to a zeroed
3412                  * representation (to be consistent, and to support cheap inequality
3413                  * tests for NULL abbreviated keys).
3414                  */
3415                 stup->datum1 = original;
3416         }
3417         else if (!consider_abort_common(state))
3418         {
3419                 /* Store abbreviated key representation */
3420                 stup->datum1 = state->sortKeys->abbrev_converter(original,
3421                                                                                                                  state->sortKeys);
3422         }
3423         else
3424         {
3425                 /* Abort abbreviation */
3426                 int                     i;
3427
3428                 stup->datum1 = original;
3429
3430                 /*
3431                  * Set state to be consistent with never trying abbreviation.
3432                  *
3433                  * Alter datum1 representation in already-copied tuples, so as to
3434                  * ensure a consistent representation (current tuple was just
3435                  * handled).  It does not matter if some dumped tuples are already
3436                  * sorted on tape, since serialized tuples lack abbreviated keys
3437                  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3438                  */
3439                 for (i = 0; i < state->memtupcount; i++)
3440                 {
3441                         SortTuple  *mtup = &state->memtuples[i];
3442
3443                         htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
3444                                 MINIMAL_TUPLE_OFFSET;
3445                         htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
3446                                                                                          MINIMAL_TUPLE_OFFSET);
3447
3448                         mtup->datum1 = heap_getattr(&htup,
3449                                                                                 state->sortKeys[0].ssup_attno,
3450                                                                                 state->tupDesc,
3451                                                                                 &mtup->isnull1);
3452                 }
3453         }
3454 }
3455
3456 static void
3457 writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
3458 {
3459         MinimalTuple tuple = (MinimalTuple) stup->tuple;
3460
3461         /* the part of the MinimalTuple we'll write: */
3462         char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3463         unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
3464
3465         /* total on-disk footprint: */
3466         unsigned int tuplen = tupbodylen + sizeof(int);
3467
3468         LogicalTapeWrite(state->tapeset, tapenum,
3469                                          (void *) &tuplen, sizeof(tuplen));
3470         LogicalTapeWrite(state->tapeset, tapenum,
3471                                          (void *) tupbody, tupbodylen);
3472         if (state->randomAccess)        /* need trailing length word? */
3473                 LogicalTapeWrite(state->tapeset, tapenum,
3474                                                  (void *) &tuplen, sizeof(tuplen));
3475
3476         if (!state->slabAllocatorUsed)
3477         {
3478                 FREEMEM(state, GetMemoryChunkSpace(tuple));
3479                 heap_free_minimal_tuple(tuple);
3480         }
3481 }
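
/*
 * For example, with 4-byte ints a MinimalTuple whose on-disk body is 60
 * bytes is written as the length word 64 (60 + sizeof(int), counting the
 * word itself) followed by the 60-byte body, and readtup_heap() recovers
 * the body length as 64 - sizeof(int).
 */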
3482
3483 static void
3484 readtup_heap(Tuplesortstate *state, SortTuple *stup,
3485                          int tapenum, unsigned int len)
3486 {
3487         unsigned int tupbodylen = len - sizeof(int);
3488         unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
3489         MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
3490         char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3491         HeapTupleData htup;
3492
3493         /* read in the tuple proper */
3494         tuple->t_len = tuplen;
3495         LogicalTapeReadExact(state->tapeset, tapenum,
3496                                                  tupbody, tupbodylen);
3497         if (state->randomAccess)        /* need trailing length word? */
3498                 LogicalTapeReadExact(state->tapeset, tapenum,
3499                                                          &tuplen, sizeof(tuplen));
3500         stup->tuple = (void *) tuple;
3501         /* set up first-column key value */
3502         htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3503         htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3504         stup->datum1 = heap_getattr(&htup,
3505                                                                 state->sortKeys[0].ssup_attno,
3506                                                                 state->tupDesc,
3507                                                                 &stup->isnull1);
3508 }
3509
3510 /*
3511  * Routines specialized for the CLUSTER case (HeapTuple data, with
3512  * comparisons per a btree index definition)
3513  */
3514
3515 static int
3516 comparetup_cluster(const SortTuple *a, const SortTuple *b,
3517                                    Tuplesortstate *state)
3518 {
3519         SortSupport sortKey = state->sortKeys;
3520         HeapTuple       ltup;
3521         HeapTuple       rtup;
3522         TupleDesc       tupDesc;
3523         int                     nkey;
3524         int32           compare;
3525         Datum           datum1,
3526                                 datum2;
3527         bool            isnull1,
3528                                 isnull2;
3529         AttrNumber      leading = state->indexInfo->ii_KeyAttrNumbers[0];
3530
3531         /* Be prepared to compare additional sort keys */
3532         ltup = (HeapTuple) a->tuple;
3533         rtup = (HeapTuple) b->tuple;
3534         tupDesc = state->tupDesc;
3535
3536         /* Compare the leading sort key, if it's simple */
3537         if (leading != 0)
3538         {
3539                 compare = ApplySortComparator(a->datum1, a->isnull1,
3540                                                                           b->datum1, b->isnull1,
3541                                                                           sortKey);
3542                 if (compare != 0)
3543                         return compare;
3544
3545                 if (sortKey->abbrev_converter)
3546                 {
3547                         datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
3548                         datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
3549
3550                         compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3551                                                                                                         datum2, isnull2,
3552                                                                                                         sortKey);
3553                 }
3554                 if (compare != 0 || state->nKeys == 1)
3555                         return compare;
3556                 /* Compare additional columns the hard way */
3557                 sortKey++;
3558                 nkey = 1;
3559         }
3560         else
3561         {
3562                 /* Must compare all keys the hard way */
3563                 nkey = 0;
3564         }
3565
3566         if (state->indexInfo->ii_Expressions == NULL)
3567         {
3568                 /* If not expression index, just compare the proper heap attrs */
3569
3570                 for (; nkey < state->nKeys; nkey++, sortKey++)
3571                 {
3572                         AttrNumber      attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
3573
3574                         datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
3575                         datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
3576
3577                         compare = ApplySortComparator(datum1, isnull1,
3578                                                                                   datum2, isnull2,
3579                                                                                   sortKey);
3580                         if (compare != 0)
3581                                 return compare;
3582                 }
3583         }
3584         else
3585         {
3586                 /*
3587                  * In the expression index case, compute the whole index tuple and
3588                  * then compare values.  It would perhaps be faster to compute only as
3589                  * many columns as we need to compare, but that would require
3590                  * duplicating all the logic in FormIndexDatum.
3591                  */
3592                 Datum           l_index_values[INDEX_MAX_KEYS];
3593                 bool            l_index_isnull[INDEX_MAX_KEYS];
3594                 Datum           r_index_values[INDEX_MAX_KEYS];
3595                 bool            r_index_isnull[INDEX_MAX_KEYS];
3596                 TupleTableSlot *ecxt_scantuple;
3597
3598                 /* Reset context each time to prevent memory leakage */
3599                 ResetPerTupleExprContext(state->estate);
3600
3601                 ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
3602
3603                 ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
3604                 FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3605                                            l_index_values, l_index_isnull);
3606
3607                 ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
3608                 FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3609                                            r_index_values, r_index_isnull);
3610
3611                 for (; nkey < state->nKeys; nkey++, sortKey++)
3612                 {
3613                         compare = ApplySortComparator(l_index_values[nkey],
3614                                                                                   l_index_isnull[nkey],
3615                                                                                   r_index_values[nkey],
3616                                                                                   r_index_isnull[nkey],
3617                                                                                   sortKey);
3618                         if (compare != 0)
3619                                 return compare;
3620                 }
3621         }
3622
3623         return 0;
3624 }
3625
3626 static void
3627 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
3628 {
3629         HeapTuple       tuple = (HeapTuple) tup;
3630         Datum           original;
3631         MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3632
3633         /* copy the tuple into sort storage */
3634         tuple = heap_copytuple(tuple);
3635         stup->tuple = (void *) tuple;
3636         USEMEM(state, GetMemoryChunkSpace(tuple));
3637
3638         MemoryContextSwitchTo(oldcontext);
3639
3640         /*
3641          * set up first-column key value, and potentially abbreviate, if it's a
3642          * simple column
3643          */
3644         if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
3645                 return;
3646
3647         original = heap_getattr(tuple,
3648                                                         state->indexInfo->ii_KeyAttrNumbers[0],
3649                                                         state->tupDesc,
3650                                                         &stup->isnull1);
3651
3652         if (!state->sortKeys->abbrev_converter || stup->isnull1)
3653         {
3654                 /*
3655                  * Store the ordinary Datum representation, or the NULL value.  If
3656                  * there is a converter it won't expect NULL values, and the cost
3657                  * model is not required to account for NULL, so in that case we
3658                  * avoid calling the converter and just set datum1 to a zeroed
3659                  * representation (to be consistent, and to support cheap inequality
3660                  * tests for NULL abbreviated keys).
3661                  */
3662                 stup->datum1 = original;
3663         }
3664         else if (!consider_abort_common(state))
3665         {
3666                 /* Store abbreviated key representation */
3667                 stup->datum1 = state->sortKeys->abbrev_converter(original,
3668                                                                                                                  state->sortKeys);
3669         }
3670         else
3671         {
3672                 /* Abort abbreviation */
3673                 int                     i;
3674
3675                 stup->datum1 = original;
3676
3677                 /*
3678                  * Set state to be consistent with never trying abbreviation.
3679                  *
3680                  * Alter datum1 representation in already-copied tuples, so as to
3681                  * ensure a consistent representation (current tuple was just
3682                  * handled).  It does not matter if some dumped tuples are already
3683                  * sorted on tape, since serialized tuples lack abbreviated keys
3684                  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3685                  */
3686                 for (i = 0; i < state->memtupcount; i++)
3687                 {
3688                         SortTuple  *mtup = &state->memtuples[i];
3689
3690                         tuple = (HeapTuple) mtup->tuple;
3691                         mtup->datum1 = heap_getattr(tuple,
3692                                                                                 state->indexInfo->ii_KeyAttrNumbers[0],
3693                                                                                 state->tupDesc,
3694                                                                                 &mtup->isnull1);
3695                 }
3696         }
3697 }
3698
3699 static void
3700 writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
3701 {
3702         HeapTuple       tuple = (HeapTuple) stup->tuple;
3703         unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
3704
3705         /* We need to store t_self, but not other fields of HeapTupleData */
3706         LogicalTapeWrite(state->tapeset, tapenum,
3707                                          &tuplen, sizeof(tuplen));
3708         LogicalTapeWrite(state->tapeset, tapenum,
3709                                          &tuple->t_self, sizeof(ItemPointerData));
3710         LogicalTapeWrite(state->tapeset, tapenum,
3711                                          tuple->t_data, tuple->t_len);
3712         if (state->randomAccess)        /* need trailing length word? */
3713                 LogicalTapeWrite(state->tapeset, tapenum,
3714                                                  &tuplen, sizeof(tuplen));
3715
3716         if (!state->slabAllocatorUsed)
3717         {
3718                 FREEMEM(state, GetMemoryChunkSpace(tuple));
3719                 heap_freetuple(tuple);
3720         }
3721 }
3722
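/*
 * Read back a CLUSTER-style tuple written by writetup_cluster, rebuilding
 * the HeapTupleData header in front of the tuple body and restoring t_self.
 * t_tableOid is not stored on tape, so it is reset to InvalidOid here.
 */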
3723 static void
3724 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
3725                                 int tapenum, unsigned int tuplen)
3726 {
3727         unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
3728         HeapTuple       tuple = (HeapTuple) readtup_alloc(state,
3729                                                                                                   t_len + HEAPTUPLESIZE);
3730
3731         /* Reconstruct the HeapTupleData header */
3732         tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
3733         tuple->t_len = t_len;
3734         LogicalTapeReadExact(state->tapeset, tapenum,
3735                                                  &tuple->t_self, sizeof(ItemPointerData));
3736         /* We don't currently bother to reconstruct t_tableOid */
3737         tuple->t_tableOid = InvalidOid;
3738         /* Read in the tuple body */
3739         LogicalTapeReadExact(state->tapeset, tapenum,
3740                                                  tuple->t_data, tuple->t_len);
3741         if (state->randomAccess)        /* need trailing length word? */
3742                 LogicalTapeReadExact(state->tapeset, tapenum,
3743                                                          &tuplen, sizeof(tuplen));
3744         stup->tuple = (void *) tuple;
3745         /* set up first-column key value, if it's a simple column */
3746         if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
3747                 stup->datum1 = heap_getattr(tuple,
3748                                                                         state->indexInfo->ii_KeyAttrNumbers[0],
3749                                                                         state->tupDesc,
3750                                                                         &stup->isnull1);
3751 }
3752
3753 /*
3754  * Routines specialized for IndexTuple case
3755  *
3756  * The btree and hash cases require separate comparison functions, but the
3757  * IndexTuple representation is the same so the copy/write/read support
3758  * functions can be shared.
3759  */
3760
3761 static int
3762 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
3763                                            Tuplesortstate *state)
3764 {
3765         /*
3766          * This is similar to comparetup_heap(), but expects index tuples.  There
3767          * is also special handling for enforcing uniqueness, and special
3768          * treatment for equal keys at the end.
3769          */
3770         SortSupport sortKey = state->sortKeys;
3771         IndexTuple      tuple1;
3772         IndexTuple      tuple2;
3773         int                     keysz;
3774         TupleDesc       tupDes;
3775         bool            equal_hasnull = false;
3776         int                     nkey;
3777         int32           compare;
3778         Datum           datum1,
3779                                 datum2;
3780         bool            isnull1,
3781                                 isnull2;
3782
3783
3784         /* Compare the leading sort key */
3785         compare = ApplySortComparator(a->datum1, a->isnull1,
3786                                                                   b->datum1, b->isnull1,
3787                                                                   sortKey);
3788         if (compare != 0)
3789                 return compare;
3790
3791         /* Compare additional sort keys */
3792         tuple1 = (IndexTuple) a->tuple;
3793         tuple2 = (IndexTuple) b->tuple;
3794         keysz = state->nKeys;
3795         tupDes = RelationGetDescr(state->indexRel);
3796
3797         if (sortKey->abbrev_converter)
3798         {
3799                 datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
3800                 datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
3801
3802                 compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3803                                                                                                 datum2, isnull2,
3804                                                                                                 sortKey);
3805                 if (compare != 0)
3806                         return compare;
3807         }
3808
3809         /* they are equal, so we only need to examine one null flag */
3810         if (a->isnull1)
3811                 equal_hasnull = true;
3812
3813         sortKey++;
3814         for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
3815         {
3816                 datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
3817                 datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
3818
3819                 compare = ApplySortComparator(datum1, isnull1,
3820                                                                           datum2, isnull2,
3821                                                                           sortKey);
3822                 if (compare != 0)
3823                         return compare;         /* done when we find unequal attributes */
3824
3825                 /* they are equal, so we only need to examine one null flag */
3826                 if (isnull1)
3827                         equal_hasnull = true;
3828         }
3829
3830         /*
3831          * If btree has asked us to enforce uniqueness, complain if two equal
3832          * tuples are detected (unless there was at least one NULL field).
3833          *
3834          * It is sufficient to make the test here, because if two tuples are equal
3835          * they *must* get compared at some stage of the sort --- otherwise the
3836          * sort algorithm wouldn't have checked whether one must appear before the
3837          * other.
3838          */
3839         if (state->enforceUnique && !equal_hasnull)
3840         {
3841                 Datum           values[INDEX_MAX_KEYS];
3842                 bool            isnull[INDEX_MAX_KEYS];
3843                 char       *key_desc;
3844
3845                 /*
3846                  * Some rather brain-dead implementations of qsort (such as the one in
3847                  * QNX 4) will sometimes call the comparison routine to compare a
3848                  * value to itself, but we always use our own implementation, which
3849                  * does not.
3850                  */
3851                 Assert(tuple1 != tuple2);
3852
3853                 index_deform_tuple(tuple1, tupDes, values, isnull);
3854
3855                 key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
3856
3857                 ereport(ERROR,
3858                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
3859                                  errmsg("could not create unique index \"%s\"",
3860                                                 RelationGetRelationName(state->indexRel)),
3861                                  key_desc ? errdetail("Key %s is duplicated.", key_desc) :
3862                                  errdetail("Duplicate keys exist."),
3863                                  errtableconstraint(state->heapRel,
3864                                                                         RelationGetRelationName(state->indexRel))));
3865         }
3866
3867         /*
3868          * If key values are equal, we sort on ItemPointer.  This does not affect
3869          * validity of the finished index, but it may be useful to have index
3870          * scans in physical order.
3871          */
3872         {
3873                 BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
3874                 BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
3875
3876                 if (blk1 != blk2)
3877                         return (blk1 < blk2) ? -1 : 1;
3878         }
3879         {
3880                 OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
3881                 OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
3882
3883                 if (pos1 != pos2)
3884                         return (pos1 < pos2) ? -1 : 1;
3885         }
3886
3887         return 0;
3888 }
3889
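/*
 * Compare two index tuples for a hash index build: order primarily by the
 * destination bucket computed from the stored hash key (datum1), then by
 * heap TID (block number, then offset) to break ties.
 */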
3890 static int
3891 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
3892                                           Tuplesortstate *state)
3893 {
3894         Bucket          bucket1;
3895         Bucket          bucket2;
3896         IndexTuple      tuple1;
3897         IndexTuple      tuple2;
3898
3899         /*
3900          * Fetch hash keys and mask off bits we don't want to sort by. We know
3901          * that the first column of the index tuple is the hash key.
3902          */
3903         Assert(!a->isnull1);
3904         bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
3905                                                                    state->max_buckets, state->high_mask,
3906                                                                    state->low_mask);
3907         Assert(!b->isnull1);
3908         bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
3909                                                                    state->max_buckets, state->high_mask,
3910                                                                    state->low_mask);
3911         if (bucket1 > bucket2)
3912                 return 1;
3913         else if (bucket1 < bucket2)
3914                 return -1;
3915
3916         /*
3917          * If hash values are equal, we sort on ItemPointer.  This does not affect
3918          * validity of the finished index, but it may be useful to have index
3919          * scans in physical order.
3920          */
3921         tuple1 = (IndexTuple) a->tuple;
3922         tuple2 = (IndexTuple) b->tuple;
3923
3924         {
3925                 BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
3926                 BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
3927
3928                 if (blk1 != blk2)
3929                         return (blk1 < blk2) ? -1 : 1;
3930         }
3931         {
3932                 OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
3933                 OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
3934
3935                 if (pos1 != pos2)
3936                         return (pos1 < pos2) ? -1 : 1;
3937         }
3938
3939         return 0;
3940 }
3941
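/*
 * Copy an index tuple into the sort's tuple context and set up datum1 from
 * the first key column, applying the abbreviation converter when one is in
 * use, or abandoning abbreviation for the whole sort if
 * consider_abort_common() decides it is not paying off.
 */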
3942 static void
3943 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
3944 {
3945         IndexTuple      tuple = (IndexTuple) tup;
3946         unsigned int tuplen = IndexTupleSize(tuple);
3947         IndexTuple      newtuple;
3948         Datum           original;
3949
3950         /* copy the tuple into sort storage */
3951         newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen);
3952         memcpy(newtuple, tuple, tuplen);
3953         USEMEM(state, GetMemoryChunkSpace(newtuple));
3954         stup->tuple = (void *) newtuple;
3955         /* set up first-column key value */
3956         original = index_getattr(newtuple,
3957                                                          1,
3958                                                          RelationGetDescr(state->indexRel),
3959                                                          &stup->isnull1);
3960
3961         if (!state->sortKeys->abbrev_converter || stup->isnull1)
3962         {
3963                 /*
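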
3964                  * Store the ordinary Datum representation, or the NULL value.  If
3965                  * there is a converter it won't expect NULL values, and the cost
3966                  * model is not required to account for NULLs, so in that case we
3967                  * skip the converter and just set datum1 to the zeroed
3968                  * representation (to be consistent, and to support cheap inequality
3969                  * tests for NULL abbreviated keys).
3970                  */
3971                 stup->datum1 = original;
3972         }
3973         else if (!consider_abort_common(state))
3974         {
3975                 /* Store abbreviated key representation */
3976                 stup->datum1 = state->sortKeys->abbrev_converter(original,
3977                                                                                                                  state->sortKeys);
3978         }
3979         else
3980         {
3981                 /* Abort abbreviation */
3982                 int                     i;
3983
3984                 stup->datum1 = original;
3985
3986                 /*
3987                  * Set state to be consistent with never trying abbreviation.
3988                  *
3989                  * Alter the datum1 representation in the already-copied tuples, so
3990                  * as to ensure a consistent representation (the current tuple was
3991                  * just handled).  It does not matter if some dumped tuples are
3992                  * already sorted on tape, since serialized tuples lack abbreviated
3993                  * keys (TSS_BUILDRUNS state prevents control reaching here in any case).
3994                  */
3995                 for (i = 0; i < state->memtupcount; i++)
3996                 {
3997                         SortTuple  *mtup = &state->memtuples[i];
3998
3999                         tuple = (IndexTuple) mtup->tuple;
4000                         mtup->datum1 = index_getattr(tuple,
4001                                                                                  1,
4002                                                                                  RelationGetDescr(state->indexRel),
4003                                                                                  &mtup->isnull1);
4004                 }
4005         }
4006 }
4007
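/*
 * Write an IndexTuple to tape as [tuplen][tuple body][tuplen?], where tuplen
 * covers the length word plus the tuple itself, and the trailing copy of
 * tuplen appears only when randomAccess requires backward reads.
 */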
4008 static void
4009 writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
4010 {
4011         IndexTuple      tuple = (IndexTuple) stup->tuple;
4012         unsigned int tuplen;
4013
4014         tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4015         LogicalTapeWrite(state->tapeset, tapenum,
4016                                          (void *) &tuplen, sizeof(tuplen));
4017         LogicalTapeWrite(state->tapeset, tapenum,
4018                                          (void *) tuple, IndexTupleSize(tuple));
4019         if (state->randomAccess)        /* need trailing length word? */
4020                 LogicalTapeWrite(state->tapeset, tapenum,
4021                                                  (void *) &tuplen, sizeof(tuplen));
4022
4023         if (!state->slabAllocatorUsed)
4024         {
4025                 FREEMEM(state, GetMemoryChunkSpace(tuple));
4026                 pfree(tuple);
4027         }
4028 }
4029
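/*
 * Read back an IndexTuple written by writetup_index and recompute datum1
 * from its first key column.
 */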
4030 static void
4031 readtup_index(Tuplesortstate *state, SortTuple *stup,
4032                           int tapenum, unsigned int len)
4033 {
4034         unsigned int tuplen = len - sizeof(unsigned int);
4035         IndexTuple      tuple = (IndexTuple) readtup_alloc(state, tuplen);
4036
4037         LogicalTapeReadExact(state->tapeset, tapenum,
4038                                                  tuple, tuplen);
4039         if (state->randomAccess)        /* need trailing length word? */
4040                 LogicalTapeReadExact(state->tapeset, tapenum,
4041                                                          &tuplen, sizeof(tuplen));
4042         stup->tuple = (void *) tuple;
4043         /* set up first-column key value */
4044         stup->datum1 = index_getattr(tuple,
4045                                                                  1,
4046                                                                  RelationGetDescr(state->indexRel),
4047                                                                  &stup->isnull1);
4048 }
4049
4050 /*
4051  * Routines specialized for DatumTuple case
4052  */
4053
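/*
 * Compare two Datum sort entries on their single sort key.  When abbreviated
 * keys are in use, datum1 holds the abbreviation and "tuple" holds the
 * original value, so ties on the abbreviation are resolved with the full
 * comparator.
 */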
4054 static int
4055 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4056 {
4057         int                     compare;
4058
4059         compare = ApplySortComparator(a->datum1, a->isnull1,
4060                                                                   b->datum1, b->isnull1,
4061                                                                   state->sortKeys);
4062         if (compare != 0)
4063                 return compare;
4064
4065         /* if we have abbreviations, then "tuple" has the original value */
4066
4067         if (state->sortKeys->abbrev_converter)
4068                 compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4069                                                                                                 PointerGetDatum(b->tuple), b->isnull1,
4070                                                                                                 state->sortKeys);
4071
4072         return compare;
4073 }
4074
4075 static void
4076 copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4077 {
4078         /* Not currently needed */
4079         elog(ERROR, "copytup_datum() should not be called");
4080 }
4081
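/*
 * Write a Datum to tape as [writtenlen][value bytes][writtenlen?].  A NULL
 * is written with no value bytes at all, a pass-by-value datum is written as
 * a raw Datum, and a pass-by-reference datum is written using its full
 * datumGetSize() length; writtenlen always includes the length word itself.
 */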
4082 static void
4083 writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
4084 {
4085         void       *waddr;
4086         unsigned int tuplen;
4087         unsigned int writtenlen;
4088
4089         if (stup->isnull1)
4090         {
4091                 waddr = NULL;
4092                 tuplen = 0;
4093         }
4094         else if (!state->tuples)
4095         {
4096                 waddr = &stup->datum1;
4097                 tuplen = sizeof(Datum);
4098         }
4099         else
4100         {
4101                 waddr = stup->tuple;
4102                 tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4103                 Assert(tuplen != 0);
4104         }
4105
4106         writtenlen = tuplen + sizeof(unsigned int);
4107
4108         LogicalTapeWrite(state->tapeset, tapenum,
4109                                          (void *) &writtenlen, sizeof(writtenlen));
4110         LogicalTapeWrite(state->tapeset, tapenum,
4111                                          waddr, tuplen);
4112         if (state->randomAccess)        /* need trailing length word? */
4113                 LogicalTapeWrite(state->tapeset, tapenum,
4114                                                  (void *) &writtenlen, sizeof(writtenlen));
4115
4116         if (!state->slabAllocatorUsed && stup->tuple)
4117         {
4118                 FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4119                 pfree(stup->tuple);
4120         }
4121 }
4122
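/*
 * Read back a Datum written by writetup_datum: a zero-length payload means
 * NULL, a sizeof(Datum) payload is a pass-by-value datum read directly into
 * datum1, and anything else is a pass-by-reference value placed in memory
 * obtained from readtup_alloc.
 */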
4123 static void
4124 readtup_datum(Tuplesortstate *state, SortTuple *stup,
4125                           int tapenum, unsigned int len)
4126 {
4127         unsigned int tuplen = len - sizeof(unsigned int);
4128
4129         if (tuplen == 0)
4130         {
4131                 /* it's NULL */
4132                 stup->datum1 = (Datum) 0;
4133                 stup->isnull1 = true;
4134                 stup->tuple = NULL;
4135         }
4136         else if (!state->tuples)
4137         {
4138                 Assert(tuplen == sizeof(Datum));
4139                 LogicalTapeReadExact(state->tapeset, tapenum,
4140                                                          &stup->datum1, tuplen);
4141                 stup->isnull1 = false;
4142                 stup->tuple = NULL;
4143         }
4144         else
4145         {
4146                 void       *raddr = readtup_alloc(state, tuplen);
4147
4148                 LogicalTapeReadExact(state->tapeset, tapenum,
4149                                                          raddr, tuplen);
4150                 stup->datum1 = PointerGetDatum(raddr);
4151                 stup->isnull1 = false;
4152                 stup->tuple = raddr;
4153         }
4154
4155         if (state->randomAccess)        /* need trailing length word? */
4156                 LogicalTapeReadExact(state->tapeset, tapenum,
4157                                                          &tuplen, sizeof(tuplen));
4158 }
4159
4160 /*
4161  * Convenience routine to free a tuple previously loaded into sort memory
4162  */
4163 static void
4164 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4165 {
4166         FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4167         pfree(stup->tuple);
4168 }