... or at least, when the planner's cost estimates say it will be faster.
Leonardo Francalanci, reviewed by Itagaki Takahiro and Tom Lane
</para>
<para>
- During the cluster operation, a temporary copy of the table is created
- that contains the table data in the index order. Temporary copies of
- each index on the table are created as well. Therefore, you need free
- space on disk at least equal to the sum of the table size and the index
- sizes.
+ <command>CLUSTER</> can re-sort the table using either an indexscan
+ on the specified index, or (if the index is a b-tree) a sequential
+ scan followed by sorting. It will attempt to choose the method that
+ will be faster, based on planner cost parameters and available statistical
+ information.
</para>
<para>
- Because <command>CLUSTER</command> remembers the clustering information,
- one can cluster the tables one wants clustered manually the first time, and
- setup a timed event similar to <command>VACUUM</command> so that the tables
- are periodically reclustered.
+ When an indexscan is used, a temporary copy of the table is created that
+ contains the table data in the index order. Temporary copies of each
+ index on the table are created as well. Therefore, you need free space on
+ disk at least equal to the sum of the table size and the index sizes.
+ </para>
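+
+ <para>
+  For example (the figures are purely illustrative), clustering a 10 GB
+  table that carries 2 GB of indexes with the indexscan method calls for
+  at least 12 GB of free disk space.
+ </para>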
+
+ <para>
+ When a sequential scan and sort is used, a temporary sort file is
+ also created, so that the peak temporary space requirement is as much
+ as double the table size, plus the index sizes. This method is often
+ faster than the indexscan method, but if the disk space requirement is
+ intolerable, you can disable this choice by temporarily setting <xref
+ linkend="guc-enable-sort"> to <literal>off</>.
+ </para>
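+
+ <para>
+  In the same illustrative example, the sort method could transiently need
+  on the order of 22 GB: a 10 GB table copy, a 10 GB sort file, and 2 GB of
+  index copies.  To force the indexscan method for a single table (the
+  table and index names here are purely illustrative):
+<programlisting>
+SET enable_sort = off;
+CLUSTER employees USING employees_pkey;
+RESET enable_sort;
+</programlisting>
+ </para>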
+
+ <para>
+ It is advisable to set <xref linkend="guc-maintenance-work-mem"> to
+ a reasonably large value (but not more than the amount of RAM you can
+ dedicate to the <command>CLUSTER</> operation) before clustering.
</para>
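+
+ <para>
+  For instance (the value shown is only illustrative; choose one that fits
+  the memory you can spare):
+<programlisting>
+SET maintenance_work_mem = '1GB';
+CLUSTER employees USING employees_pkey;
+RESET maintenance_work_mem;
+</programlisting>
+ </para>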
<para>
</para>
<para>
- There is another way to cluster data. The
- <command>CLUSTER</command> command reorders the original table by
- scanning it using the index you specify. This can be slow
- on large tables because the rows are fetched from the table
- in index order, and if the table is disordered, the
- entries are on random pages, so there is one disk page
- retrieved for every row moved. (<productname>PostgreSQL</productname> has
- a cache, but the majority of a big table will not fit in the cache.)
- The other way to cluster a table is to use:
-
-<programlisting>
-CREATE TABLE <replaceable class="parameter">newtable</replaceable> AS
- SELECT * FROM <replaceable class="parameter">table</replaceable> ORDER BY <replaceable class="parameter">columnlist</replaceable>;
-</programlisting>
-
- which uses the <productname>PostgreSQL</productname> sorting code
- to produce the desired order;
- this is usually much faster than an index scan for disordered data.
- Then you drop the old table, use
- <command>ALTER TABLE ... RENAME</command>
- to rename <replaceable class="parameter">newtable</replaceable> to the
- old name, and recreate the table's indexes.
- The big disadvantage of this approach is that it does not preserve
- OIDs, constraints, foreign key relationships, granted privileges, and
- other ancillary properties of the table — all such items must be
- manually recreated. Another disadvantage is that this way requires a sort
- temporary file about the same size as the table itself, so peak disk usage
- is about three times the table size instead of twice the table size.
+ Because <command>CLUSTER</command> remembers which indexes are clustered,
+ one can manually cluster the desired tables once, then set up a periodic
+ maintenance script that executes <command>CLUSTER</> without any
+ parameters, so that those tables are periodically reclustered.
</para>
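+
+ <para>
+  For example, a nightly maintenance job might simply run (the database
+  name here is only illustrative):
+<programlisting>
+psql -c "CLUSTER;" mydatabase
+</programlisting>
+ </para>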
+
</refsect1>
<refsect1>
#include "commands/trigger.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
+#include "optimizer/planner.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
+#include "utils/tuplesort.h"
/*
int freeze_min_age, int freeze_table_age,
bool *pSwapToastByContent, TransactionId *pFreezeXid);
static List *get_tables_to_cluster(MemoryContext cluster_context);
-
+static void reform_and_rewrite_tuple(HeapTuple tuple,
+ TupleDesc oldTupDesc, TupleDesc newTupDesc,
+ Datum *values, bool *isnull,
+ bool newRelHasOids, RewriteState rwstate);
/*---------------------------------------------------------------------------
TransactionId OldestXmin;
TransactionId FreezeXid;
RewriteState rwstate;
+ bool use_sort;
+ Tuplesortstate *tuplesort;
/*
* Open the relations we need.
rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
/*
- * Scan through the OldHeap, either in OldIndex order or sequentially, and
- * copy each tuple into the NewHeap. To ensure we see recently-dead
- * tuples that still need to be copied, we scan with SnapshotAny and use
+ * Decide whether to use an indexscan or seqscan-and-optional-sort to
+ * scan the OldHeap. We know how to use a sort to duplicate the ordering
+ * of a btree index, and will use seqscan-and-sort for that case if the
+ * planner tells us it's cheaper. Otherwise, always indexscan if an
+ * index is provided, else plain seqscan.
+ */
+ if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
+ use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
+ else
+ use_sort = false;
+
+ /* Set up sorting if wanted */
+ if (use_sort)
+ tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
+ maintenance_work_mem, false);
+ else
+ tuplesort = NULL;
+
+ /*
+ * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
+ * that still need to be copied, we scan with SnapshotAny and use
* HeapTupleSatisfiesVacuum for the visibility test.
*/
- if (OldIndex != NULL)
+ if (OldIndex != NULL && !use_sort)
{
heapScan = NULL;
indexScan = index_beginscan(OldHeap, OldIndex,
indexScan = NULL;
}
+ /*
+ * Scan through the OldHeap, either in OldIndex order or sequentially;
+ * copy each tuple into the NewHeap, or transiently to the tuplesort
+ * module. Note that we don't bother sorting dead tuples (they won't
+ * get to the new table anyway).
+ */
for (;;)
{
HeapTuple tuple;
- HeapTuple copiedTuple;
Buffer buf;
bool isdead;
- int i;
CHECK_FOR_INTERRUPTS();
- if (OldIndex != NULL)
+ if (indexScan != NULL)
{
tuple = index_getnext(indexScan, ForwardScanDirection);
if (tuple == NULL)
continue;
}
- /*
- * We cannot simply copy the tuple as-is, for several reasons:
- *
- * 1. We'd like to squeeze out the values of any dropped columns, both
- * to save space and to ensure we have no corner-case failures. (It's
- * possible for example that the new table hasn't got a TOAST table
- * and so is unable to store any large values of dropped cols.)
- *
- * 2. The tuple might not even be legal for the new table; this is
- * currently only known to happen as an after-effect of ALTER TABLE
- * SET WITHOUT OIDS.
- *
- * So, we must reconstruct the tuple from component Datums.
- */
- heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+ if (tuplesort != NULL)
+ tuplesort_putheaptuple(tuplesort, tuple);
+ else
+ reform_and_rewrite_tuple(tuple,
+ oldTupDesc, newTupDesc,
+ values, isnull,
+ NewHeap->rd_rel->relhasoids, rwstate);
+ }
- /* Be sure to null out any dropped columns */
- for (i = 0; i < natts; i++)
+ if (indexScan != NULL)
+ index_endscan(indexScan);
+ if (heapScan != NULL)
+ heap_endscan(heapScan);
+
+ /*
+ * In scan-and-sort mode, complete the sort, then read out all live
+	 * tuples from the tuplesort and write them to the new relation.
+ */
+ if (tuplesort != NULL)
+ {
+ tuplesort_performsort(tuplesort);
+
+ for (;;)
{
- if (newTupDesc->attrs[i]->attisdropped)
- isnull[i] = true;
- }
+ HeapTuple tuple;
+ bool shouldfree;
- copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
+ CHECK_FOR_INTERRUPTS();
- /* Preserve OID, if any */
- if (NewHeap->rd_rel->relhasoids)
- HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
+ tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree);
+ if (tuple == NULL)
+ break;
- /* The heap rewrite module does the rest */
- rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+ reform_and_rewrite_tuple(tuple,
+ oldTupDesc, newTupDesc,
+ values, isnull,
+ NewHeap->rd_rel->relhasoids, rwstate);
- heap_freetuple(copiedTuple);
- }
+ if (shouldfree)
+ heap_freetuple(tuple);
+ }
- if (OldIndex != NULL)
- index_endscan(indexScan);
- else
- heap_endscan(heapScan);
+ tuplesort_end(tuplesort);
+ }
/* Write out any remaining tuples, and fsync if needed */
end_heap_rewrite(rwstate);
return rvs;
}
+
+
+/*
+ * Reconstruct and rewrite the given tuple
+ *
+ * We cannot simply copy the tuple as-is, for several reasons:
+ *
+ * 1. We'd like to squeeze out the values of any dropped columns, both
+ * to save space and to ensure we have no corner-case failures. (It's
+ * possible for example that the new table hasn't got a TOAST table
+ * and so is unable to store any large values of dropped cols.)
+ *
+ * 2. The tuple might not even be legal for the new table; this is
+ * currently only known to happen as an after-effect of ALTER TABLE
+ * SET WITHOUT OIDS.
+ *
+ * So, we must reconstruct the tuple from component Datums.
+ */
+static void
+reform_and_rewrite_tuple(HeapTuple tuple,
+ TupleDesc oldTupDesc, TupleDesc newTupDesc,
+ Datum *values, bool *isnull,
+ bool newRelHasOids, RewriteState rwstate)
+{
+ HeapTuple copiedTuple;
+ int i;
+
+ heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+
+ /* Be sure to null out any dropped columns */
+ for (i = 0; i < newTupDesc->natts; i++)
+ {
+ if (newTupDesc->attrs[i]->attisdropped)
+ isnull[i] = true;
+ }
+
+ copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
+
+ /* Preserve OID, if any */
+ if (newRelHasOids)
+ HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
+
+ /* The heap rewrite module does the rest */
+ rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+
+ heap_freetuple(copiedTuple);
+}
* Determines and returns the cost of sorting a relation, including
* the cost of reading the input data.
*
- * If the total volume of data to sort is less than work_mem, we will do
+ * If the total volume of data to sort is less than sort_mem, we will do
* an in-memory sort, which requires no I/O and about t*log2(t) tuple
* comparisons for t tuples.
*
- * If the total volume exceeds work_mem, we switch to a tape-style merge
+ * If the total volume exceeds sort_mem, we switch to a tape-style merge
* algorithm. There will still be about t*log2(t) tuple comparisons in
* total, but we will also need to write and read each tuple once per
* merge pass. We expect about ceil(logM(r)) merge passes where r is the
* number of initial runs formed and M is the merge order used by tuplesort.c.
- * Since the average initial run should be about twice work_mem, we have
- * disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
+ * Since the average initial run should be about twice sort_mem, we have
+ * disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))
* cpu = comparison_cost * t * log2(t)
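+ *
+ * For example (illustrative numbers only): if the input forms r = 100
+ * initial runs and the merge order is M = 6, we expect ceil(log6(100)) = 3
+ * merge passes, and hence disk traffic of roughly 6 * relsize.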
*
* If the sort is bounded (i.e., only the first k result tuples are needed)
- * and k tuples can fit into work_mem, we use a heap method that keeps only
+ * and k tuples can fit into sort_mem, we use a heap method that keeps only
* k tuples in the heap; this will require about t*log2(k) tuple comparisons.
*
* The disk traffic is assumed to be 3/4ths sequential and 1/4th random
* accesses (XXX can't we refine that guess?)
*
- * We charge two operator evals per tuple comparison, which should be in
- * the right ballpark in most cases.
+ * By default, we charge two operator evals per tuple comparison, which should
+ * be in the right ballpark in most cases. The caller can tweak this by
+ * specifying nonzero comparison_cost; typically that's used for any extra
+ * work that has to be done to prepare the inputs to the comparison operators.
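+ * (For example, plan_cluster_use_sort passes twice the evaluation cost of
+ * any index expressions, since tuplesort.c has to re-evaluate them for both
+ * inputs of each comparison.)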
*
* 'pathkeys' is a list of sort keys
* 'input_cost' is the total cost for reading the input data
* 'tuples' is the number of tuples in the relation
* 'width' is the average tuple width in bytes
+ * 'comparison_cost' is the extra cost per comparison, if any
+ * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
* 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
*
* NOTE: some callers currently pass NIL for pathkeys because they
void
cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
+ Cost comparison_cost, int sort_mem,
double limit_tuples)
{
Cost startup_cost = input_cost;
double input_bytes = relation_byte_size(tuples, width);
double output_bytes;
double output_tuples;
- long work_mem_bytes = work_mem * 1024L;
+ long sort_mem_bytes = sort_mem * 1024L;
if (!enable_sort)
startup_cost += disable_cost;
if (tuples < 2.0)
tuples = 2.0;
+ /* Include the default cost-per-comparison */
+ comparison_cost += 2.0 * cpu_operator_cost;
+
/* Do we have a useful LIMIT? */
if (limit_tuples > 0 && limit_tuples < tuples)
{
output_bytes = input_bytes;
}
- if (output_bytes > work_mem_bytes)
+ if (output_bytes > sort_mem_bytes)
{
/*
* We'll have to use a disk-based sort of all the tuples
*/
double npages = ceil(input_bytes / BLCKSZ);
- double nruns = (input_bytes / work_mem_bytes) * 0.5;
- double mergeorder = tuplesort_merge_order(work_mem_bytes);
+ double nruns = (input_bytes / sort_mem_bytes) * 0.5;
+ double mergeorder = tuplesort_merge_order(sort_mem_bytes);
double log_runs;
double npageaccesses;
/*
* CPU costs
*
- * Assume about two operator evals per tuple comparison and N log2 N
- * comparisons
+ * Assume about N log2 N comparisons
*/
- startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+ startup_cost += comparison_cost * tuples * LOG2(tuples);
/* Disk costs */
startup_cost += npageaccesses *
(seq_page_cost * 0.75 + random_page_cost * 0.25);
}
- else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
+ else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
{
/*
* We'll use a bounded heap-sort keeping just K tuples in memory, for
* factor is a bit higher than for quicksort. Tweak it so that the
* cost curve is continuous at the crossover point.
*/
- startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
+ startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
}
else
{
/* We'll use plain quicksort on all the input tuples */
- startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+ startup_cost += comparison_cost * tuples * LOG2(tuples);
}
/*
outer_path->total_cost,
outer_path_rows,
outer_path->parent->width,
+ 0.0,
+ work_mem,
-1.0);
startup_cost += sort_path.startup_cost;
startup_cost += (sort_path.total_cost - sort_path.startup_cost)
inner_path->total_cost,
inner_path_rows,
inner_path->parent->width,
+ 0.0,
+ work_mem,
-1.0);
startup_cost += sort_path.startup_cost;
startup_cost += (sort_path.total_cost - sort_path.startup_cost)
#include <math.h>
#include "access/skey.h"
+#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
lefttree->total_cost,
lefttree->plan_rows,
lefttree->plan_width,
+ 0.0,
+ work_mem,
limit_tuples);
plan->startup_cost = sort_path.startup_cost;
plan->total_cost = sort_path.total_cost;
*/
#include "postgres.h"
+#include "miscadmin.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
cost_sort(&sort_path, root, root->query_pathkeys,
cheapestpath->total_cost,
final_rel->rows, final_rel->width,
- limit_tuples);
+ 0.0, work_mem, limit_tuples);
}
if (compare_fractional_path_costs(sortedpath, &sort_path,
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
+#include "optimizer/plancat.h"
#include "optimizer/planmain.h"
#include "optimizer/planner.h"
#include "optimizer/prep.h"
/* Result of hashed agg is always unsorted */
if (target_pathkeys)
cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost,
- dNumGroups, path_width, limit_tuples);
+ dNumGroups, path_width,
+ 0.0, work_mem, limit_tuples);
if (sorted_path)
{
if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
{
cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost,
- path_rows, path_width, -1.0);
+ path_rows, path_width,
+ 0.0, work_mem, -1.0);
current_pathkeys = root->group_pathkeys;
}
if (target_pathkeys &&
!pathkeys_contained_in(target_pathkeys, current_pathkeys))
cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost,
- dNumGroups, path_width, limit_tuples);
+ dNumGroups, path_width,
+ 0.0, work_mem, limit_tuples);
/*
* Now make the decision using the top-level tuple fraction. First we
*/
if (parse->sortClause)
cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
- dNumDistinctRows, path_width, limit_tuples);
+ dNumDistinctRows, path_width,
+ 0.0, work_mem, limit_tuples);
/*
* Now for the GROUP case. See comments in grouping_planner about the
else
current_pathkeys = root->sort_pathkeys;
cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost,
- path_rows, path_width, -1.0);
+ path_rows, path_width,
+ 0.0, work_mem, -1.0);
}
cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows,
sorted_p.startup_cost, sorted_p.total_cost,
if (parse->sortClause &&
!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
- dNumDistinctRows, path_width, limit_tuples);
+ dNumDistinctRows, path_width,
+ 0.0, work_mem, limit_tuples);
/*
* Now make the decision using the top-level tuple fraction. First we
return (Expr *) result;
}
+
+
+/*
+ * plan_cluster_use_sort
+ * Use the planner to decide how CLUSTER should implement sorting
+ *
+ * tableOid is the OID of a table to be clustered on its index indexOid
+ * (which is already known to be a btree index). Decide whether it's
+ * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER.
+ * Return TRUE to use sorting, FALSE to use an indexscan.
+ *
+ * Note: caller had better already hold some type of lock on the table.
+ */
+bool
+plan_cluster_use_sort(Oid tableOid, Oid indexOid)
+{
+ PlannerInfo *root;
+ Query *query;
+ PlannerGlobal *glob;
+ RangeTblEntry *rte;
+ RelOptInfo *rel;
+ IndexOptInfo *indexInfo;
+ QualCost indexExprCost;
+ Cost comparisonCost;
+ Path *seqScanPath;
+ Path seqScanAndSortPath;
+ IndexPath *indexScanPath;
+ ListCell *lc;
+
+ /* Set up mostly-dummy planner state */
+ query = makeNode(Query);
+ query->commandType = CMD_SELECT;
+
+ glob = makeNode(PlannerGlobal);
+
+ root = makeNode(PlannerInfo);
+ root->parse = query;
+ root->glob = glob;
+ root->query_level = 1;
+ root->planner_cxt = CurrentMemoryContext;
+ root->wt_param_id = -1;
+
+ /* Build a minimal RTE for the rel */
+ rte = makeNode(RangeTblEntry);
+ rte->rtekind = RTE_RELATION;
+ rte->relid = tableOid;
+ rte->inh = false;
+ rte->inFromCl = true;
+ query->rtable = list_make1(rte);
+
+ /* ... and insert it into PlannerInfo */
+ root->simple_rel_array_size = 2;
+ root->simple_rel_array = (RelOptInfo **)
+ palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));
+ root->simple_rte_array = (RangeTblEntry **)
+ palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
+ root->simple_rte_array[1] = rte;
+
+ /* Build RelOptInfo */
+ rel = build_simple_rel(root, 1, RELOPT_BASEREL);
+
+ /*
+ * Rather than doing all the pushups that would be needed to use
+ * set_baserel_size_estimates, just do a quick hack for rows and width.
+ */
+ rel->rows = rel->tuples;
+ rel->width = get_relation_data_width(tableOid);
+
+ root->total_table_pages = rel->pages;
+
+ /* Locate IndexOptInfo for the target index */
+ indexInfo = NULL;
+ foreach(lc, rel->indexlist)
+ {
+ indexInfo = (IndexOptInfo *) lfirst(lc);
+ if (indexInfo->indexoid == indexOid)
+ break;
+ }
+ if (lc == NULL) /* not in the list? */
+ elog(ERROR, "index %u does not belong to table %u",
+ indexOid, tableOid);
+
+ /*
+ * Determine eval cost of the index expressions, if any. We need to
+ * charge twice that amount for each tuple comparison that happens
+ * during the sort, since tuplesort.c will have to re-evaluate the
+ * index expressions each time. (XXX that's pretty inefficient...)
+ */
+ cost_qual_eval(&indexExprCost, indexInfo->indexprs, root);
+ comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple);
+
+ /* Estimate the cost of seq scan + sort */
+ seqScanPath = create_seqscan_path(root, rel);
+ cost_sort(&seqScanAndSortPath, root, NIL,
+ seqScanPath->total_cost, rel->tuples, rel->width,
+ comparisonCost, maintenance_work_mem, -1.0);
+
+ /* Estimate the cost of index scan */
+ indexScanPath = create_index_path(root, indexInfo,
+ NIL, NIL,
+ ForwardScanDirection, NULL);
+
+ return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
+}
sorted_p.total_cost = input_plan->total_cost;
/* XXX cost_sort doesn't actually look at pathkeys, so just pass NIL */
cost_sort(&sorted_p, root, NIL, sorted_p.total_cost,
- input_plan->plan_rows, input_plan->plan_width, -1.0);
+ input_plan->plan_rows, input_plan->plan_width,
+ 0.0, work_mem, -1.0);
cost_group(&sorted_p, root, numGroupCols, dNumGroups,
sorted_p.startup_cost, sorted_p.total_cost,
input_plan->plan_rows);
subpath->total_cost,
rel->rows,
rel->width,
+ 0.0,
+ work_mem,
-1.0);
/*
get_relation_info_hook_type get_relation_info_hook = NULL;
+static int32 get_rel_data_width(Relation rel, int32 *attr_widths);
static List *get_relation_constraints(PlannerInfo *root,
Oid relationObjectId, RelOptInfo *rel,
bool include_notnull);
* platform dependencies in the default plans which are kind
* of a headache for regression testing.
*/
- int32 tuple_width = 0;
- int i;
+ int32 tuple_width;
- for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++)
- {
- Form_pg_attribute att = rel->rd_att->attrs[i - 1];
- int32 item_width;
-
- if (att->attisdropped)
- continue;
- /* This should match set_rel_width() in costsize.c */
- item_width = get_attavgwidth(RelationGetRelid(rel), i);
- if (item_width <= 0)
- {
- item_width = get_typavgwidth(att->atttypid,
- att->atttypmod);
- Assert(item_width > 0);
- }
- if (attr_widths != NULL)
- attr_widths[i] = item_width;
- tuple_width += item_width;
- }
+ tuple_width = get_rel_data_width(rel, attr_widths);
tuple_width += sizeof(HeapTupleHeaderData);
tuple_width += sizeof(ItemPointerData);
/* note: integer division is intentional here */
}
+/*
+ * get_rel_data_width
+ *
+ * Estimate the average width of (the data part of) the relation's tuples.
+ * If attr_widths isn't NULL, also store per-column width estimates into
+ * that array.
+ *
+ * Currently we ignore dropped columns. Ideally those should be included
+ * in the result, but we haven't got any way to get info about them; and
+ * since they might be mostly NULLs, treating them as zero-width is not
+ * necessarily the wrong thing anyway.
+ */
+static int32
+get_rel_data_width(Relation rel, int32 *attr_widths)
+{
+ int32 tuple_width = 0;
+ int i;
+
+ for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++)
+ {
+ Form_pg_attribute att = rel->rd_att->attrs[i - 1];
+ int32 item_width;
+
+ if (att->attisdropped)
+ continue;
+ /* This should match set_rel_width() in costsize.c */
+ item_width = get_attavgwidth(RelationGetRelid(rel), i);
+ if (item_width <= 0)
+ {
+ item_width = get_typavgwidth(att->atttypid, att->atttypmod);
+ Assert(item_width > 0);
+ }
+ if (attr_widths != NULL)
+ attr_widths[i] = item_width;
+ tuple_width += item_width;
+ }
+
+ return tuple_width;
+}
+
+/*
+ * get_relation_data_width
+ *
+ * External API for get_rel_data_width
+ */
+int32
+get_relation_data_width(Oid relid)
+{
+ int32 result;
+ Relation relation;
+
+ /* As above, assume relation is already locked */
+ relation = heap_open(relid, NoLock);
+
+ result = get_rel_data_width(relation, NULL);
+
+ heap_close(relation, NoLock);
+
+ return result;
+}
+
+
/*
* get_relation_constraints
*
#include "access/genam.h"
#include "access/nbtree.h"
+#include "catalog/index.h"
#include "catalog/pg_amop.h"
#include "catalog/pg_operator.h"
#include "commands/tablespace.h"
+#include "executor/executor.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "utils/datum.h"
#define HEAP_SORT 0
#define INDEX_SORT 1
#define DATUM_SORT 2
+#define CLUSTER_SORT 3
/* GUC variables */
#ifdef TRACE_SORT
TupleDesc tupDesc;
ScanKey scanKeys; /* array of length nKeys */
+ /*
+ * These variables are specific to the CLUSTER case; they are set by
+ * tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and
+ * indexScanKey.
+ */
+ IndexInfo *indexInfo; /* info about index being used for reference */
+ EState *estate; /* for evaluating index expressions */
+
/*
* These variables are specific to the IndexTuple case; they are set by
* tuplesort_begin_index_xxx and used only by the IndexTuple routines.
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
static void reversedirection_heap(Tuplesortstate *state);
+static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
+static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
+static void writetup_cluster(Tuplesortstate *state, int tapenum,
+ SortTuple *stup);
+static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int len);
static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
return state;
}
+Tuplesortstate *
+tuplesort_begin_cluster(TupleDesc tupDesc,
+ Relation indexRel,
+ int workMem, bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+ MemoryContext oldcontext;
+
+ Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
+
+ oldcontext = MemoryContextSwitchTo(state->sortcontext);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
+ RelationGetNumberOfAttributes(indexRel),
+ workMem, randomAccess ? 't' : 'f');
+#endif
+
+ state->nKeys = RelationGetNumberOfAttributes(indexRel);
+
+ TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
+ false, /* no unique check */
+ state->nKeys,
+ workMem,
+ randomAccess);
+
+ state->comparetup = comparetup_cluster;
+ state->copytup = copytup_cluster;
+ state->writetup = writetup_cluster;
+ state->readtup = readtup_cluster;
+ state->reversedirection = reversedirection_index_btree;
+
+ state->indexInfo = BuildIndexInfo(indexRel);
+ state->indexScanKey = _bt_mkscankey_nodata(indexRel);
+
+ state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
+
+ if (state->indexInfo->ii_Expressions != NULL)
+ {
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+
+ /*
+ * We will need to use FormIndexDatum to evaluate the index
+ * expressions. To do that, we need an EState, as well as a
+ * TupleTableSlot to put the table tuples into. The econtext's
+ * scantuple has to point to that slot, too.
+ */
+ state->estate = CreateExecutorState();
+ slot = MakeSingleTupleTableSlot(tupDesc);
+ econtext = GetPerTupleExprContext(state->estate);
+ econtext->ecxt_scantuple = slot;
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return state;
+}
+
Tuplesortstate *
tuplesort_begin_index_btree(Relation indexRel,
bool enforceUnique,
TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
#endif
+ /* Free any execution state created for CLUSTER case */
+ if (state->estate != NULL)
+ {
+ ExprContext *econtext = GetPerTupleExprContext(state->estate);
+
+ ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
+ FreeExecutorState(state->estate);
+ }
+
MemoryContextSwitchTo(oldcontext);
/*
MemoryContextSwitchTo(oldcontext);
}
+/*
+ * Accept one tuple while collecting input data for sort.
+ *
+ * Note that the input data is always copied; the caller need not save it.
+ */
+void
+tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
+{
+ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ SortTuple stup;
+
+ /*
+ * Copy the given tuple into memory we control, and decrease availMem.
+ * Then call the common code.
+ */
+ COPYTUP(state, &stup, (void *) tup);
+
+ puttuple_common(state, &stup);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
/*
* Accept one index tuple while collecting input data for sort.
*
}
}
+/*
+ * Fetch the next tuple in either forward or back direction.
+ * Returns NULL if no more tuples. If *should_free is set, the
+ * caller must pfree the returned tuple when done with it.
+ */
+HeapTuple
+tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free)
+{
+ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ SortTuple stup;
+
+ if (!tuplesort_gettuple_common(state, forward, &stup, should_free))
+ stup.tuple = NULL;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return stup.tuple;
+}
+
/*
* Fetch the next index tuple in either forward or back direction.
* Returns NULL if no more tuples. If *should_free is set, the
}
+/*
+ * Routines specialized for the CLUSTER case (HeapTuple data, with
+ * comparisons per a btree index definition)
+ */
+
+static int
+comparetup_cluster(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state)
+{
+ ScanKey scanKey = state->indexScanKey;
+ HeapTuple ltup;
+ HeapTuple rtup;
+ TupleDesc tupDesc;
+ int nkey;
+ int32 compare;
+
+ /* Allow interrupting long sorts */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Compare the leading sort key, if it's simple */
+ if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
+ {
+ compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags,
+ a->datum1, a->isnull1,
+ b->datum1, b->isnull1);
+ if (compare != 0 || state->nKeys == 1)
+ return compare;
+ /* Compare additional columns the hard way */
+ scanKey++;
+ nkey = 1;
+ }
+ else
+ {
+ /* Must compare all keys the hard way */
+ nkey = 0;
+ }
+
+ /* Compare additional sort keys */
+ ltup = (HeapTuple) a->tuple;
+ rtup = (HeapTuple) b->tuple;
+
+ if (state->indexInfo->ii_Expressions == NULL)
+ {
+ /* If not expression index, just compare the proper heap attrs */
+ tupDesc = state->tupDesc;
+
+ for (; nkey < state->nKeys; nkey++, scanKey++)
+ {
+ AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
+ datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
+
+ compare = inlineApplySortFunction(&scanKey->sk_func,
+ scanKey->sk_flags,
+ datum1, isnull1,
+ datum2, isnull2);
+ if (compare != 0)
+ return compare;
+ }
+ }
+ else
+ {
+ /*
+ * In the expression index case, compute the whole index tuple and
+ * then compare values. It would perhaps be faster to compute only as
+ * many columns as we need to compare, but that would require
+ * duplicating all the logic in FormIndexDatum.
+ */
+ Datum l_index_values[INDEX_MAX_KEYS];
+ bool l_index_isnull[INDEX_MAX_KEYS];
+ Datum r_index_values[INDEX_MAX_KEYS];
+ bool r_index_isnull[INDEX_MAX_KEYS];
+ TupleTableSlot *ecxt_scantuple;
+
+ /* Reset context each time to prevent memory leakage */
+ ResetPerTupleExprContext(state->estate);
+
+ ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+
+ ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
+ FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ l_index_values, l_index_isnull);
+
+ ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
+ FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ r_index_values, r_index_isnull);
+
+ for (; nkey < state->nKeys; nkey++, scanKey++)
+ {
+ compare = inlineApplySortFunction(&scanKey->sk_func,
+ scanKey->sk_flags,
+ l_index_values[nkey],
+ l_index_isnull[nkey],
+ r_index_values[nkey],
+ r_index_isnull[nkey]);
+ if (compare != 0)
+ return compare;
+ }
+ }
+
+ return 0;
+}
+
+static void
+copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
+{
+ HeapTuple tuple = (HeapTuple) tup;
+
+ /* copy the tuple into sort storage */
+ tuple = heap_copytuple(tuple);
+ stup->tuple = (void *) tuple;
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* set up first-column key value, if it's a simple column */
+ if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
+ stup->datum1 = heap_getattr(tuple,
+ state->indexInfo->ii_KeyAttrNumbers[0],
+ state->tupDesc,
+ &stup->isnull1);
+}
+
+static void
+writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
+{
+ HeapTuple tuple = (HeapTuple) stup->tuple;
+ unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
+
+ /* We need to store t_self, but not other fields of HeapTupleData */
+ LogicalTapeWrite(state->tapeset, tapenum,
+ &tuplen, sizeof(tuplen));
+ LogicalTapeWrite(state->tapeset, tapenum,
+ &tuple->t_self, sizeof(ItemPointerData));
+ LogicalTapeWrite(state->tapeset, tapenum,
+ tuple->t_data, tuple->t_len);
+ if (state->randomAccess) /* need trailing length word? */
+ LogicalTapeWrite(state->tapeset, tapenum,
+ &tuplen, sizeof(tuplen));
+
+ FREEMEM(state, GetMemoryChunkSpace(tuple));
+ heap_freetuple(tuple);
+}
+
+static void
+readtup_cluster(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int tuplen)
+{
+ unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
+ HeapTuple tuple = (HeapTuple) palloc(t_len + HEAPTUPLESIZE);
+
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* Reconstruct the HeapTupleData header */
+ tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
+ tuple->t_len = t_len;
+ if (LogicalTapeRead(state->tapeset, tapenum,
+ &tuple->t_self,
+ sizeof(ItemPointerData)) != sizeof(ItemPointerData))
+ elog(ERROR, "unexpected end of data");
+ /* We don't currently bother to reconstruct t_tableOid */
+ tuple->t_tableOid = InvalidOid;
+ /* Read in the tuple body */
+ if (LogicalTapeRead(state->tapeset, tapenum,
+ tuple->t_data, tuple->t_len) != tuple->t_len)
+ elog(ERROR, "unexpected end of data");
+ if (state->randomAccess) /* need trailing length word? */
+ if (LogicalTapeRead(state->tapeset, tapenum, &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "unexpected end of data");
+ stup->tuple = (void *) tuple;
+ /* set up first-column key value, if it's a simple column */
+ if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
+ stup->datum1 = heap_getattr(tuple,
+ state->indexInfo->ii_KeyAttrNumbers[0],
+ state->tupDesc,
+ &stup->isnull1);
+}
+
+
/*
* Routines specialized for IndexTuple case
*
extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm);
extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
+ Cost comparison_cost, int sort_mem,
double limit_tuples);
extern void cost_material(Path *path,
Cost input_startup_cost, Cost input_total_cost,
extern void estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples);
+extern int32 get_relation_data_width(Oid relid);
+
extern bool relation_excluded_by_constraints(PlannerInfo *root,
RelOptInfo *rel, RangeTblEntry *rte);
extern Expr *expression_planner(Expr *expr);
+extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid);
+
#endif /* PLANNER_H */
typedef struct Tuplesortstate Tuplesortstate;
/*
- * We provide two different interfaces to what is essentially the same
- * code: one for sorting HeapTuples and one for sorting IndexTuples.
- * They differ primarily in the way that the sort key information is
- * supplied. Also, the HeapTuple case actually stores MinimalTuples,
- * which means it doesn't preserve the "system columns" (tuple identity and
- * transaction visibility info). The IndexTuple case does preserve all
- * the header fields of an index entry. In the HeapTuple case we can
- * save some cycles by passing and returning the tuples in TupleTableSlots,
- * rather than forming actual HeapTuples (which'd have to be converted to
- * MinimalTuples).
+ * We provide multiple interfaces to what is essentially the same code,
+ * since different callers have different data to be sorted and want to
+ * specify the sort key information differently. There are two APIs for
+ * sorting HeapTuples and two more for sorting IndexTuples. Yet another
+ * API supports sorting bare Datums.
*
- * The IndexTuple case is itself broken into two subcases, one for btree
- * indexes and one for hash indexes; the latter variant actually sorts
- * the tuples by hash code. The API is the same except for the "begin"
- * routine.
+ * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
+ * preserve the system columns (tuple identity and transaction visibility
+ * info). The sort keys are specified by column numbers within the tuples
+ * and sort operator OIDs. We save some cycles by passing and returning the
+ * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
+ * have to be converted to MinimalTuples). This API works well for sorts
+ * executed as parts of plan trees.
*
- * Yet another slightly different interface supports sorting bare Datums.
+ * The "cluster" API stores/sorts full HeapTuples including all visibility
+ * info. The sort keys are specified by reference to a btree index that is
+ * defined on the relation to be sorted. Note that putheaptuple/getheaptuple
+ * go with this API, not the "begin_heap" one!
+ *
+ * The "index_btree" API stores/sorts IndexTuples (preserving all their
+ * header fields). The sort keys are specified by a btree index definition.
+ *
+ * The "index_hash" API is similar to index_btree, but the tuples are
+ * actually sorted by their hash codes not the raw data.
*/
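+
+/*
+ * A rough sketch of the call sequence for the "cluster" API, as used by
+ * CLUSTER itself (see copy_heap_data in commands/cluster.c):
+ *
+ *		state = tuplesort_begin_cluster(oldTupDesc, OldIndex,
+ *										maintenance_work_mem, false);
+ *		(for each input tuple)
+ *			tuplesort_putheaptuple(state, tuple);
+ *		tuplesort_performsort(state);
+ *		(for each output tuple)
+ *			tuple = tuplesort_getheaptuple(state, true, &shouldfree);
+ *		tuplesort_end(state);
+ */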
extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
Oid *sortOperators, bool *nullsFirstFlags,
int workMem, bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
+ Relation indexRel,
+ int workMem, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_btree(Relation indexRel,
bool enforceUnique,
int workMem, bool randomAccess);
extern void tuplesort_puttupleslot(Tuplesortstate *state,
TupleTableSlot *slot);
+extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple);
extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
bool isNull);
extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
TupleTableSlot *slot);
+extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward,
+ bool *should_free);
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward,
bool *should_free);
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,