Remove replacement selection sort.

author Robert Haas <rhaas@postgresql.org>

Fri, 29 Sep 2017 14:20:44 +0000 (10:20 -0400)

committer Robert Haas <rhaas@postgresql.org>

Fri, 29 Sep 2017 14:25:44 +0000 (10:25 -0400)
author Robert Haas <rhaas@postgresql.org>
Fri, 29 Sep 2017 14:20:44 +0000 (10:20 -0400)
committer Robert Haas <rhaas@postgresql.org>
Fri, 29 Sep 2017 14:25:44 +0000 (10:25 -0400)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 4b265d9e40c5093829c4aedbda34c1fe19edbac5..c13f60230f9a80904f43705e41637e8bd3d88377 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1505,45 +1505,6 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
  
-     <varlistentry id="guc-replacement-sort-tuples" xreflabel="replacement_sort_tuples">
-      <term><varname>replacement_sort_tuples</varname> (<type>integer</type>)
-      <indexterm>
-       <primary><varname>replacement_sort_tuples</> configuration parameter</primary>
-      </indexterm>
-      </term>
-      <listitem>
-       <para>
-        When the number of tuples to be sorted is smaller than this number,
-        a sort will produce its first output run using replacement selection
-        rather than quicksort.  This may be useful in memory-constrained
-        environments where tuples that are input into larger sort operations
-        have a strong physical-to-logical correlation.  Note that this does
-        not include input tuples with an <emphasis>inverse</emphasis>
-        correlation.  It is possible for the replacement selection algorithm
-        to generate one long run that requires no merging, where use of the
-        default strategy would result in many runs that must be merged
-        to produce a final sorted output.  This may allow sort
-        operations to complete sooner.
-       </para>
-       <para>
-        The default is 150,000 tuples.  Note that higher values are typically
-        not much more effective, and may be counter-productive, since the
-        priority queue is sensitive to the size of available CPU cache, whereas
-        the default strategy sorts runs using a <firstterm>cache
-        oblivious</firstterm> algorithm.  This property allows the default sort
-        strategy to automatically and transparently make effective use
-        of available CPU cache.
-       </para>
-       <para>
-        Setting <varname>maintenance_work_mem</varname> to its default
-        value usually prevents utility command external sorts (e.g.,
-        sorts used by <command>CREATE INDEX</> to build B-Tree
-        indexes) from ever using replacement selection sort, unless the
-        input tuples are quite wide.
-       </para>
-      </listitem>
-     </varlistentry>
-
       <varlistentry id="guc-autovacuum-work-mem" xreflabel="autovacuum_work_mem">
        <term><varname>autovacuum_work_mem</varname> (<type>integer</type>)
        <indexterm>
diff --git a/doc/src/sgml/release-9.6.sgml b/doc/src/sgml/release-9.6.sgml

index dc811c4ca5d9ba7cbb057a543b9c66c6cd34b429..09b6b902547c5a29e37c4235038466ac5a300980 100644 (file)
--- a/doc/src/sgml/release-9.6.sgml
+++ b/doc/src/sgml/release-9.6.sgml
@@ -5140,7 +5140,7 @@ and many others in the same vein
          The new approach makes better use of the <acronym>CPU</> cache
          for typical cache sizes and data volumes.  Where necessary,
          the behavior can be adjusted via the new configuration parameter
-        <xref linkend="guc-replacement-sort-tuples">.
+        <literal>replacement_sort_tuples</literal>.
         </para>
        </listitem>
  
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c

index 7c09498dc047ebc34403a3edc85d147ef293d82e..9680a4b0f7a9edfb45ac4ab6a1880df28ab4052f 100644 (file)
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -112,7 +112,6 @@ bool                enableFsync = true;
  bool           allowSystemTableMods = false;
  int                    work_mem = 1024;
  int                    maintenance_work_mem = 16384;
-int                    replacement_sort_tuples = 150000;
  
  /*
   * Primary determinants of sizes of shared-memory structures.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index 47a5f257071caf9f849985cd131412c283be1a2e..8292df00bbfac0de6ae97e7ef2e68323af8ed7ef 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1933,16 +1933,6 @@ static struct config_int ConfigureNamesInt[] =
                 NULL, NULL, NULL
         },
  
-       {
-               {"replacement_sort_tuples", PGC_USERSET, RESOURCES_MEM,
-                       gettext_noop("Sets the maximum number of tuples to be sorted using replacement selection."),
-                       gettext_noop("When more tuples than this are present, quicksort will be used.")
-               },
-               &replacement_sort_tuples,
-               150000, 0, INT_MAX,
-               NULL, NULL, NULL
-       },
-
         /*
          * We use the hopefully-safely-small value of 100kB as the compiled-in
          * default for max_stack_depth.  InitializeGUCOptions will increase it if
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample

index 8ba6b1d08a519c5b8b70671965464adf09cb574e..cf4ddcd94a7c079db52ff67d67393dd3fc54f7b5 100644 (file)
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -121,7 +121,6 @@
  # you actively intend to use prepared transactions.
  #work_mem = 4MB                                # min 64kB
  #maintenance_work_mem = 64MB           # min 1MB
-#replacement_sort_tuples = 150000      # limits use of replacement selection sort
  #autovacuum_work_mem = -1              # min 1MB, or -1 to use maintenance_work_mem
  #max_stack_depth = 2MB                 # min 100kB
  #dynamic_shared_memory_type = posix    # the default is the first option
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c

index 17e1b6860ba068b815a10a3c0fa576c6b6615fb0..60522cb4424749eaeb9b6b2bc04bb767a1f386d6 100644 (file)
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -13,47 +13,11 @@
   * See Knuth, volume 3, for more than you want to know about the external
   * sorting algorithm.  Historically, we divided the input into sorted runs
   * using replacement selection, in the form of a priority tree implemented
- * as a heap (essentially his Algorithm 5.2.3H), but now we only do that
- * for the first run, and only if the run would otherwise end up being very
- * short.  We merge the runs using polyphase merge, Knuth's Algorithm
- * 5.4.2D.  The logical "tapes" used by Algorithm D are implemented by
- * logtape.c, which avoids space wastage by recycling disk space as soon
- * as each block is read from its "tape".
- *
- * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for
- * the replacement selection, because it uses a fixed number of records
- * in memory at all times.  Since we are dealing with tuples that may vary
- * considerably in size, we want to be able to vary the number of records
- * kept in memory to ensure full utilization of the allowed sort memory
- * space.  So, we keep the tuples in a variable-size heap, with the next
- * record to go out at the top of the heap.  Like Algorithm 5.4.1R, each
- * record is stored with the run number that it must go into, and we use
- * (run number, key) as the ordering key for the heap.  When the run number
- * at the top of the heap changes, we know that no more records of the prior
- * run are left in the heap.  Note that there are in practice only ever two
- * distinct run numbers, because since PostgreSQL 9.6, we only use
- * replacement selection to form the first run.
- *
- * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small
- * customizations) is only used with the aim of producing just one run,
- * thereby avoiding all merging.  Only the first run can use replacement
- * selection, which is why there are now only two possible valid run
- * numbers, and why heapification is customized to not distinguish between
- * tuples in the second run (those will be quicksorted).  We generally
- * prefer a simple hybrid sort-merge strategy, where runs are sorted in much
- * the same way as the entire input of an internal sort is sorted (using
- * qsort()).  The replacement_sort_tuples GUC controls the limited remaining
- * use of replacement selection for the first run.
- *
- * There are several reasons to favor a hybrid sort-merge strategy.
- * Maintaining a priority tree/heap has poor CPU cache characteristics.
- * Furthermore, the growth in main memory sizes has greatly diminished the
- * value of having runs that are larger than available memory, even in the
- * case where there is partially sorted input and runs can be made far
- * larger by using a heap.  In most cases, a single-pass merge step is all
- * that is required even when runs are no larger than available memory.
- * Avoiding multiple merge passes was traditionally considered to be the
- * major advantage of using replacement selection.
+ * as a heap (essentially his Algorithm 5.2.3H), but now we always use
+ * quicksort for run generation.  We merge the runs using polyphase merge,
+ * Knuth's Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D are
+ * implemented by logtape.c, which avoids space wastage by recycling disk
+ * space as soon as each block is read from its "tape".
   *
   * The approximate amount of memory allowed for any one sort operation
   * is specified in kilobytes by the caller (most pass work_mem).  Initially,
@@ -64,9 +28,8 @@
   * workMem, we begin to emit tuples into sorted runs in temporary tapes.
   * When tuples are dumped in batch after quicksorting, we begin a new run
   * with a new output tape (selected per Algorithm D).  After the end of the
- * input is reached, we dump out remaining tuples in memory into a final run
- * (or two, when replacement selection is still used), then merge the runs
- * using Algorithm D.
+ * input is reached, we dump out remaining tuples in memory into a final run,
+ * then merge the runs using Algorithm D.
   *
   * When merging runs, we use a heap containing just the frontmost tuple from
   * each source run; we repeatedly output the smallest tuple and replace it
@@ -188,13 +151,8 @@ bool               optimize_bounded_sort = true;
   * described above.  Accordingly, "tuple" is always used in preference to
   * datum1 as the authoritative value for pass-by-reference cases.
   *
- * While building initial runs, tupindex holds the tuple's run number.
- * Historically, the run number could meaningfully distinguish many runs, but
- * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
- * selection is always abandoned after the first run; no other run number
- * should be represented here.  During merge passes, we re-use it to hold the
- * input tape number that each tuple in the heap was read from.  tupindex goes
- * unused if the sort occurs entirely in memory.
+ * tupindex holds the input tape number that each tuple in the heap was read
+ * from during merge passes.
   */
  typedef struct
  {
@@ -253,15 +211,6 @@ typedef enum
  #define TAPE_BUFFER_OVERHEAD           BLCKSZ
  #define MERGE_BUFFER_SIZE                      (BLCKSZ * 32)
  
- /*
-  * Run numbers, used during external sort operations.
-  *
-  * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun.
-  */
-#define RUN_FIRST              0
-#define HEAP_RUN_NEXT  INT_MAX
-#define RUN_SECOND             1
-
  typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
                                                                         Tuplesortstate *state);
  
@@ -381,16 +330,8 @@ struct Tuplesortstate
         void       *lastReturnedTuple;
  
         /*
-        * While building initial runs, this indicates if the replacement
-        * selection strategy is in use.  When it isn't, then a simple hybrid
-        * sort-merge strategy is in use instead (runs are quicksorted).
-        */
-       bool            replaceActive;
-
-       /*
-        * While building initial runs, this is the current output run number
-        * (starting at RUN_FIRST).  Afterwards, it is the number of initial runs
-        * we made.
+        * While building initial runs, this is the current output run number.
+        * Afterwards, it is the number of initial runs we made.
          */
         int                     currentRun;
  
@@ -583,7 +524,6 @@ struct Tuplesortstate
  static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
  static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
  static bool consider_abort_common(Tuplesortstate *state);
-static bool useselection(Tuplesortstate *state);
  static void inittapes(Tuplesortstate *state);
  static void selectnewtape(Tuplesortstate *state);
  static void init_slab_allocator(Tuplesortstate *state, int numSlots);
@@ -592,15 +532,12 @@ static void mergeonerun(Tuplesortstate *state);
  static void beginmerge(Tuplesortstate *state);
  static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
  static void dumptuples(Tuplesortstate *state, bool alltuples);
-static void dumpbatch(Tuplesortstate *state, bool alltuples);
  static void make_bounded_heap(Tuplesortstate *state);
  static void sort_bounded_heap(Tuplesortstate *state);
  static void tuplesort_sort_memtuples(Tuplesortstate *state);
-static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
-                                         bool checkIndex);
-static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
-                                                  bool checkIndex);
-static void tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex);
+static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
+static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
+static void tuplesort_heap_delete_top(Tuplesortstate *state);
  static void reversedirection(Tuplesortstate *state);
  static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
  static void markrunend(Tuplesortstate *state, int tapenum);
@@ -738,7 +675,7 @@ tuplesort_begin_common(int workMem, bool randomAccess)
         if (LACKMEM(state))
                 elog(ERROR, "insufficient memory allowed for sort");
  
-       state->currentRun = RUN_FIRST;
+       state->currentRun = 0;
  
         /*
          * maxTapes, tapeRange, and Algorithm D variables will be initialized by
@@ -1622,7 +1559,7 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
                         inittapes(state);
  
                         /*
-                        * Dump tuples until we are back under the limit.
+                        * Dump all tuples.
                          */
                         dumptuples(state, false);
                         break;
@@ -1647,74 +1584,20 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
                         {
                                 /* discard top of heap, replacing it with the new tuple */
                                 free_sort_tuple(state, &state->memtuples[0]);
-                               tuple->tupindex = 0;    /* not used */
-                               tuplesort_heap_replace_top(state, tuple, false);
+                               tuplesort_heap_replace_top(state, tuple);
                         }
                         break;
  
                 case TSS_BUILDRUNS:
  
                         /*
-                        * Insert the tuple into the heap, with run number currentRun if
-                        * it can go into the current run, else HEAP_RUN_NEXT.  The tuple
-                        * can go into the current run if it is >= the first
-                        * not-yet-output tuple.  (Actually, it could go into the current
-                        * run if it is >= the most recently output tuple ... but that
-                        * would require keeping around the tuple we last output, and it's
-                        * simplest to let writetup free each tuple as soon as it's
-                        * written.)
-                        *
-                        * Note that this only applies when:
-                        *
-                        * - currentRun is RUN_FIRST
-                        *
-                        * - Replacement selection is in use (typically it is never used).
-                        *
-                        * When these two conditions are not both true, all tuples are
-                        * appended indifferently, much like the TSS_INITIAL case.
-                        *
-                        * There should always be room to store the incoming tuple.
+                        * Save the tuple into the unsorted array (there must be
+                        * space)
                          */
-                       Assert(!state->replaceActive || state->memtupcount > 0);
-                       if (state->replaceActive &&
-                               COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
-                       {
-                               Assert(state->currentRun == RUN_FIRST);
-
-                               /*
-                                * Insert tuple into first, fully heapified run.
-                                *
-                                * Unlike classic replacement selection, which this module was
-                                * previously based on, only RUN_FIRST tuples are fully
-                                * heapified.  Any second/next run tuples are appended
-                                * indifferently.  While HEAP_RUN_NEXT tuples may be sifted
-                                * out of the way of first run tuples, COMPARETUP() will never
-                                * be called for the run's tuples during sifting (only our
-                                * initial COMPARETUP() call is required for the tuple, to
-                                * determine that the tuple does not belong in RUN_FIRST).
-                                */
-                               tuple->tupindex = state->currentRun;
-                               tuplesort_heap_insert(state, tuple, true);
-                       }
-                       else
-                       {
-                               /*
-                                * Tuple was determined to not belong to heapified RUN_FIRST,
-                                * or replacement selection not in play.  Append the tuple to
-                                * memtuples indifferently.
-                                *
-                                * dumptuples() does not trust that the next run's tuples are
-                                * heapified.  Anything past the first run will always be
-                                * quicksorted even when replacement selection is initially
-                                * used.  (When it's never used, every tuple still takes this
-                                * path.)
-                                */
-                               tuple->tupindex = HEAP_RUN_NEXT;
-                               state->memtuples[state->memtupcount++] = *tuple;
-                       }
+                       state->memtuples[state->memtupcount++] = *tuple;
  
                         /*
-                        * If we are over the memory limit, dump tuples till we're under.
+                        * If we are over the memory limit, dump all tuples.
                          */
                         dumptuples(state, false);
                         break;
@@ -2068,7 +1951,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
                                          * If no more data, we've reached end of run on this tape.
                                          * Remove the top node from the heap.
                                          */
-                                       tuplesort_heap_delete_top(state, false);
+                                       tuplesort_heap_delete_top(state);
  
                                         /*
                                          * Rewind to free the read buffer.  It'd go away at the
@@ -2079,7 +1962,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
                                         return true;
                                 }
                                 newtup.tupindex = srcTape;
-                               tuplesort_heap_replace_top(state, &newtup, false);
+                               tuplesort_heap_replace_top(state, &newtup);
                                 return true;
                         }
                         return false;
@@ -2336,28 +2219,6 @@ tuplesort_merge_order(int64 allowedMem)
         return mOrder;
  }
  
-/*
- * useselection - determine algorithm to use to sort first run.
- *
- * It can sometimes be useful to use the replacement selection algorithm if it
- * results in one large run, and there is little available workMem.  See
- * remarks on RUN_SECOND optimization within dumptuples().
- */
-static bool
-useselection(Tuplesortstate *state)
-{
-       /*
-        * memtupsize might be noticeably higher than memtupcount here in atypical
-        * cases.  It seems slightly preferable to not allow recent outliers to
-        * impact this determination.  Note that caller's trace_sort output
-        * reports memtupcount instead.
-        */
-       if (state->memtupsize <= replacement_sort_tuples)
-               return true;
-
-       return false;
-}
-
  /*
   * inittapes - initialize for tape sorting.
   *
@@ -2413,44 +2274,7 @@ inittapes(Tuplesortstate *state)
         state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
         state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
  
-       /*
-        * Give replacement selection a try based on user setting.  There will be
-        * a switch to a simple hybrid sort-merge strategy after the first run
-        * (iff we could not output one long run).
-        */
-       state->replaceActive = useselection(state);
-
-       if (state->replaceActive)
-       {
-               /*
-                * Convert the unsorted contents of memtuples[] into a heap. Each
-                * tuple is marked as belonging to run number zero.
-                *
-                * NOTE: we pass false for checkIndex since there's no point in
-                * comparing indexes in this step, even though we do intend the
-                * indexes to be part of the sort key...
-                */
-               int                     ntuples = state->memtupcount;
-
-#ifdef TRACE_SORT
-               if (trace_sort)
-                       elog(LOG, "replacement selection will sort %d first run tuples",
-                                state->memtupcount);
-#endif
-               state->memtupcount = 0; /* make the heap empty */
-
-               for (j = 0; j < ntuples; j++)
-               {
-                       /* Must copy source tuple to avoid possible overwrite */
-                       SortTuple       stup = state->memtuples[j];
-
-                       stup.tupindex = RUN_FIRST;
-                       tuplesort_heap_insert(state, &stup, false);
-               }
-               Assert(state->memtupcount == ntuples);
-       }
-
-       state->currentRun = RUN_FIRST;
+       state->currentRun = 0;
  
         /*
          * Initialize variables of Algorithm D (step D1).
@@ -2624,22 +2448,6 @@ mergeruns(Tuplesortstate *state)
         else
                 init_slab_allocator(state, 0);
  
-       /*
-        * If we produced only one initial run (quite likely if the total data
-        * volume is between 1X and 2X workMem when replacement selection is used,
-        * but something we particularly count on when input is presorted), we can
-        * just use that tape as the finished output, rather than doing a useless
-        * merge.  (This obvious optimization is not in Knuth's algorithm.)
-        */
-       if (state->currentRun == RUN_SECOND)
-       {
-               state->result_tape = state->tp_tapenum[state->destTape];
-               /* must freeze and rewind the finished output tape */
-               LogicalTapeFreeze(state->tapeset, state->result_tape);
-               state->status = TSS_SORTEDONTAPE;
-               return;
-       }
-
         /*
          * Allocate a new 'memtuples' array, for the heap.  It will hold one tuple
          * from each input tape.
@@ -2826,11 +2634,11 @@ mergeonerun(Tuplesortstate *state)
                 if (mergereadnext(state, srcTape, &stup))
                 {
                         stup.tupindex = srcTape;
-                       tuplesort_heap_replace_top(state, &stup, false);
+                       tuplesort_heap_replace_top(state, &stup);
  
                 }
                 else
-                       tuplesort_heap_delete_top(state, false);
+                       tuplesort_heap_delete_top(state);
         }
  
         /*
@@ -2892,7 +2700,7 @@ beginmerge(Tuplesortstate *state)
                 if (mergereadnext(state, srcTape, &tup))
                 {
                         tup.tupindex = srcTape;
-                       tuplesort_heap_insert(state, &tup, false);
+                       tuplesort_heap_insert(state, &tup);
                 }
         }
  }
@@ -2922,124 +2730,25 @@ mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
  }
  
  /*
- * dumptuples - remove tuples from memtuples and write to tape
- *
- * This is used during initial-run building, but not during merging.
+ * dumptuples - remove tuples from memtuples and write initial run to tape
   *
- * When alltuples = false and replacement selection is still active, dump
- * only enough tuples to get under the availMem limit (and leave at least
- * one tuple in memtuples, since puttuple will then assume it is a heap that
- * has a tuple to compare to).  We always insist there be at least one free
- * slot in the memtuples[] array.
- *
- * When alltuples = true, dump everything currently in memory.  (This
- * case is only used at end of input data, although in practice only the
- * first run could fail to dump all tuples when we LACKMEM(), and only
- * when replacement selection is active.)
- *
- * If, when replacement selection is active, we see that the tuple run
- * number at the top of the heap has changed, start a new run.  This must be
- * the first run, because replacement selection is always abandoned for all
- * further runs.
+ * When alltuples = true, dump everything currently in memory.  (This case is
+ * only used at end of input data.)
   */
  static void
  dumptuples(Tuplesortstate *state, bool alltuples)
-{
-       while (alltuples ||
-                  (LACKMEM(state) && state->memtupcount > 1) ||
-                  state->memtupcount >= state->memtupsize)
-       {
-               if (state->replaceActive)
-               {
-                       /*
-                        * Still holding out for a case favorable to replacement
-                        * selection. Still incrementally spilling using heap.
-                        *
-                        * Dump the heap's frontmost entry, and remove it from the heap.
-                        */
-                       Assert(state->memtupcount > 0);
-                       WRITETUP(state, state->tp_tapenum[state->destTape],
-                                        &state->memtuples[0]);
-                       tuplesort_heap_delete_top(state, true);
-               }
-               else
-               {
-                       /*
-                        * Once committed to quicksorting runs, never incrementally spill
-                        */
-                       dumpbatch(state, alltuples);
-                       break;
-               }
-
-               /*
-                * If top run number has changed, we've finished the current run (this
-                * can only be the first run), and will no longer spill incrementally.
-                */
-               if (state->memtupcount == 0 ||
-                       state->memtuples[0].tupindex == HEAP_RUN_NEXT)
-               {
-                       markrunend(state, state->tp_tapenum[state->destTape]);
-                       Assert(state->currentRun == RUN_FIRST);
-                       state->currentRun++;
-                       state->tp_runs[state->destTape]++;
-                       state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
-
-#ifdef TRACE_SORT
-                       if (trace_sort)
-                               elog(LOG, "finished incrementally writing %s run %d to tape %d: %s",
-                                        (state->memtupcount == 0) ? "only" : "first",
-                                        state->currentRun, state->destTape,
-                                        pg_rusage_show(&state->ru_start));
-#endif
-
-                       /*
-                        * Done if heap is empty, which is possible when there is only one
-                        * long run.
-                        */
-                       Assert(state->currentRun == RUN_SECOND);
-                       if (state->memtupcount == 0)
-                       {
-                               /*
-                                * Replacement selection best case; no final merge required,
-                                * because there was only one initial run (second run has no
-                                * tuples).  See RUN_SECOND case in mergeruns().
-                                */
-                               break;
-                       }
-
-                       /*
-                        * Abandon replacement selection for second run (as well as any
-                        * subsequent runs).
-                        */
-                       state->replaceActive = false;
-
-                       /*
-                        * First tuple of next run should not be heapified, and so will
-                        * bear placeholder run number.  In practice this must actually be
-                        * the second run, which just became the currentRun, so we're
-                        * clear to quicksort and dump the tuples in batch next time
-                        * memtuples becomes full.
-                        */
-                       Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT);
-                       selectnewtape(state);
-               }
-       }
-}
-
-/*
- * dumpbatch - sort and dump all memtuples, forming one run on tape
- *
- * Second or subsequent runs are never heapified by this module (although
- * heapification still respects run number differences between the first and
- * second runs), and a heap (replacement selection priority queue) is often
- * avoided in the first place.
- */
-static void
-dumpbatch(Tuplesortstate *state, bool alltuples)
  {
         int                     memtupwrite;
         int                     i;
  
+       /*
+        * Nothing to do if we still fit in available memory and have array
+        * slots, unless this is the final call during initial run generation.
+        */
+       if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
+               !alltuples)
+               return;
+
         /*
          * Final call might require no sorting, in rare cases where we just so
          * happen to have previously LACKMEM()'d at the point where exactly all
@@ -3308,21 +3017,8 @@ tuplesort_space_type_name(TuplesortSpaceType t)
  
  /*
   * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
- *
- * Compare two SortTuples.  If checkIndex is true, use the tuple index
- * as the front of the sort key; otherwise, no.
- *
- * Note that for checkIndex callers, the heap invariant is never
- * maintained beyond the first run, and so there are no COMPARETUP()
- * calls needed to distinguish tuples in HEAP_RUN_NEXT.
   */
  
-#define HEAPCOMPARE(tup1,tup2) \
-       (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \
-                                       (tup1)->tupindex == HEAP_RUN_NEXT) ? \
-        ((tup1)->tupindex) - ((tup2)->tupindex) : \
-        COMPARETUP(state, tup1, tup2))
-
  /*
   * Convert the existing unordered array of SortTuples to a bounded heap,
   * discarding all but the smallest "state->bound" tuples.
@@ -3331,10 +3027,6 @@ tuplesort_space_type_name(TuplesortSpaceType t)
   * at the root (array entry zero), instead of the smallest as in the normal
   * sort case.  This allows us to discard the largest entry cheaply.
   * Therefore, we temporarily reverse the sort direction.
- *
- * We assume that all entries in a bounded heap will always have tupindex
- * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
- * the direction of comparison for tupindexes.
   */
  static void
  make_bounded_heap(Tuplesortstate *state)
@@ -3358,8 +3050,7 @@ make_bounded_heap(Tuplesortstate *state)
                         /* Must copy source tuple to avoid possible overwrite */
                         SortTuple       stup = state->memtuples[i];
  
-                       stup.tupindex = 0;      /* not used */
-                       tuplesort_heap_insert(state, &stup, false);
+                       tuplesort_heap_insert(state, &stup);
                 }
                 else
                 {
@@ -3374,7 +3065,7 @@ make_bounded_heap(Tuplesortstate *state)
                                 CHECK_FOR_INTERRUPTS();
                         }
                         else
-                               tuplesort_heap_replace_top(state, &state->memtuples[i], false);
+                               tuplesort_heap_replace_top(state, &state->memtuples[i]);
                 }
         }
  
@@ -3404,7 +3095,7 @@ sort_bounded_heap(Tuplesortstate *state)
                 SortTuple       stup = state->memtuples[0];
  
                 /* this sifts-up the next-largest entry and decreases memtupcount */
-               tuplesort_heap_delete_top(state, false);
+               tuplesort_heap_delete_top(state);
                 state->memtuples[state->memtupcount] = stup;
         }
         state->memtupcount = tupcount;
@@ -3422,10 +3113,7 @@ sort_bounded_heap(Tuplesortstate *state)
  /*
   * Sort all memtuples using specialized qsort() routines.
   *
- * Quicksort is used for small in-memory sorts.  Quicksort is also generally
- * preferred to replacement selection for generating runs during external sort
- * operations, although replacement selection is sometimes used for the first
- * run.
+ * Quicksort is used for small in-memory sorts, and external sort runs.
   */
  static void
  tuplesort_sort_memtuples(Tuplesortstate *state)
@@ -3454,15 +3142,13 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
   * is, it might get overwritten before being moved into the heap!
   */
  static void
-tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
-                                         bool checkIndex)
+tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
  {
         SortTuple  *memtuples;
         int                     j;
  
         memtuples = state->memtuples;
         Assert(state->memtupcount < state->memtupsize);
-       Assert(!checkIndex || tuple->tupindex == RUN_FIRST);
  
         CHECK_FOR_INTERRUPTS();
  
@@ -3475,7 +3161,7 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
         {
                 int                     i = (j - 1) >> 1;
  
-               if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0)
+               if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
                         break;
                 memtuples[j] = memtuples[i];
                 j = i;
@@ -3491,12 +3177,11 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
   * if necessary.
   */
  static void
-tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex)
+tuplesort_heap_delete_top(Tuplesortstate *state)
  {
         SortTuple  *memtuples = state->memtuples;
         SortTuple  *tuple;
  
-       Assert(!checkIndex || state->currentRun == RUN_FIRST);
         if (--state->memtupcount <= 0)
                 return;
  
@@ -3505,7 +3190,7 @@ tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex)
          * current top node with it.
          */
         tuple = &memtuples[state->memtupcount];
-       tuplesort_heap_replace_top(state, tuple, checkIndex);
+       tuplesort_heap_replace_top(state, tuple);
  }
  
  /*
@@ -3516,14 +3201,12 @@ tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex)
   * Heapsort, steps H3-H8).
   */
  static void
-tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
-                                                  bool checkIndex)
+tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
  {
         SortTuple  *memtuples = state->memtuples;
         unsigned int i,
                                 n;
  
-       Assert(!checkIndex || state->currentRun == RUN_FIRST);
         Assert(state->memtupcount >= 1);
  
         CHECK_FOR_INTERRUPTS();
@@ -3542,9 +3225,9 @@ tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
                 if (j >= n)
                         break;
                 if (j + 1 < n &&
-                       HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0)
+                       COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
                         j++;
-               if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0)
+               if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
                         break;
                 memtuples[i] = memtuples[j];
                 i = j;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h

index dad98de98d7c3560411032d5aa54a76170163c3a..3950054368b4a1758e21c528df1c31edb2d59b3a 100644 (file)
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -241,7 +241,6 @@ extern bool enableFsync;
  extern bool allowSystemTableMods;
  extern PGDLLIMPORT int work_mem;
  extern PGDLLIMPORT int maintenance_work_mem;
-extern PGDLLIMPORT int replacement_sort_tuples;
  
  extern int     VacuumCostPageHit;
  extern int     VacuumCostPageMiss;
diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out

index 097ac2b006068d695a59a086828481ca30056564..82713bfa2c7067b33ab219d7d88566eadf2e125b 100644 (file)
--- a/src/test/regress/expected/cluster.out
+++ b/src/test/regress/expected/cluster.out
@@ -444,22 +444,8 @@ create table clstr_4 as select * from tenk1;
  create index cluster_sort on clstr_4 (hundred, thousand, tenthous);
  -- ensure we don't use the index in CLUSTER nor the checking SELECTs
  set enable_indexscan = off;
--- Use external sort that only ever uses quicksort to sort runs:
+-- Use external sort:
  set maintenance_work_mem = '1MB';
-set replacement_sort_tuples = 0;
-cluster clstr_4 using cluster_sort;
-select * from
-(select hundred, lag(hundred) over () as lhundred,
-        thousand, lag(thousand) over () as lthousand,
-        tenthous, lag(tenthous) over () as ltenthous from clstr_4) ss
-where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous);
- hundred | lhundred | thousand | lthousand | tenthous | ltenthous 
----------+----------+----------+-----------+----------+-----------
-(0 rows)
-
--- Replacement selection will now be forced.  It should only produce a single
--- run, due to the fact that input is found to be presorted:
-set replacement_sort_tuples = 150000;
  cluster clstr_4 using cluster_sort;
  select * from
  (select hundred, lag(hundred) over () as lhundred,
@@ -472,7 +458,6 @@ where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous);
  
  reset enable_indexscan;
  reset maintenance_work_mem;
-reset replacement_sort_tuples;
  -- clean up
  DROP TABLE clustertest;
  DROP TABLE clstr_1;
diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql

index 8dd9459bda00c131a8477f2ecb5f0b204c144b10..a6c2757efaaac6ebce58ed3265b7141d3bde3522 100644 (file)
--- a/src/test/regress/sql/cluster.sql
+++ b/src/test/regress/sql/cluster.sql
@@ -203,19 +203,8 @@ create index cluster_sort on clstr_4 (hundred, thousand, tenthous);
  -- ensure we don't use the index in CLUSTER nor the checking SELECTs
  set enable_indexscan = off;
  
--- Use external sort that only ever uses quicksort to sort runs:
+-- Use external sort:
  set maintenance_work_mem = '1MB';
-set replacement_sort_tuples = 0;
-cluster clstr_4 using cluster_sort;
-select * from
-(select hundred, lag(hundred) over () as lhundred,
-        thousand, lag(thousand) over () as lthousand,
-        tenthous, lag(tenthous) over () as ltenthous from clstr_4) ss
-where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous);
-
--- Replacement selection will now be forced.  It should only produce a single
--- run, due to the fact that input is found to be presorted:
-set replacement_sort_tuples = 150000;
  cluster clstr_4 using cluster_sort;
  select * from
  (select hundred, lag(hundred) over () as lhundred,
@@ -225,7 +214,6 @@ where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous);
  
  reset enable_indexscan;
  reset maintenance_work_mem;
-reset replacement_sort_tuples;
  
  -- clean up
  DROP TABLE clustertest;
author	Robert Haas <rhaas@postgresql.org>
	Fri, 29 Sep 2017 14:20:44 +0000 (10:20 -0400)
committer	Robert Haas <rhaas@postgresql.org>
	Fri, 29 Sep 2017 14:25:44 +0000 (10:25 -0400)
doc/src/sgml/config.sgml		patch \| blob \| history
doc/src/sgml/release-9.6.sgml		patch \| blob \| history
src/backend/utils/init/globals.c		patch \| blob \| history
src/backend/utils/misc/guc.c		patch \| blob \| history
src/backend/utils/misc/postgresql.conf.sample		patch \| blob \| history
src/backend/utils/sort/tuplesort.c		patch \| blob \| history
src/include/miscadmin.h		patch \| blob \| history
src/test/regress/expected/cluster.out		patch \| blob \| history
src/test/regress/sql/cluster.sql		patch \| blob \| history