granicus.if.org Git - postgresql/commitdiff
Improve SELECT DISTINCT to consider hash aggregation, as well as sort/uniq,
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 5 Aug 2008 02:43:18 +0000 (02:43 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 5 Aug 2008 02:43:18 +0000 (02:43 +0000)
as methods for implementing the DISTINCT step.  This eliminates the former
performance gap between DISTINCT and GROUP BY, and also makes it possible
to do SELECT DISTINCT on datatypes that only support hashing not sorting.

SELECT DISTINCT ON is still always implemented by sorting; it would take
executor changes to support hashing that, and it's not clear it's worth
the trouble.

This is a release-note-worthy incompatibility from previous PG versions,
since SELECT DISTINCT can no longer be counted on to deliver sorted output
without explicitly saying ORDER BY.  (Anyone who can't cope with that
can consider turning off enable_hashagg.)

Several regression test queries needed to have ORDER BY added to preserve
stable output order.  I fixed the ones that manifested here, but there
might be some other cases that show up on other platforms.

13 files changed:
src/backend/nodes/outfuncs.c
src/backend/optimizer/plan/planmain.c
src/backend/optimizer/plan/planner.c
src/backend/parser/parse_clause.c
src/include/nodes/relation.h
src/test/regress/expected/numerology.out
src/test/regress/expected/opr_sanity.out
src/test/regress/expected/select_distinct.out
src/test/regress/input/misc.source
src/test/regress/output/misc.source
src/test/regress/sql/numerology.sql
src/test/regress/sql/opr_sanity.sql
src/test/regress/sql/select_distinct.sql

index 66630ae6612a8acaabbfe43c6f88845f323e2940..408b9b2a757e24e08b1ff44ba473c31f0e545217 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.329 2008/08/02 21:31:59 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.330 2008/08/05 02:43:17 tgl Exp $
  *
  * NOTES
  *       Every node type that can appear in stored rules' parsetrees *must*
@@ -1334,6 +1334,7 @@ _outPlannerInfo(StringInfo str, PlannerInfo *node)
        WRITE_NODE_FIELD(append_rel_list);
        WRITE_NODE_FIELD(query_pathkeys);
        WRITE_NODE_FIELD(group_pathkeys);
+       WRITE_NODE_FIELD(distinct_pathkeys);
        WRITE_NODE_FIELD(sort_pathkeys);
        WRITE_FLOAT_FIELD(total_table_pages, "%.0f");
        WRITE_FLOAT_FIELD(tuple_fraction, "%.4f");
index 5e5da6cda4f1bd6eeff833a97ddd3373830375fa..081a7c9cebd1f37615cd42e2a49d3d29b706e42e 100644 (file)
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.108 2008/08/03 19:10:52 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.109 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -66,9 +66,9 @@
  * PlannerInfo field and not a passed parameter is that the low-level routines
  * in indxpath.c need to see it.)
  *
- * Note: the PlannerInfo node also includes group_pathkeys and sort_pathkeys,
- * which like query_pathkeys need to be canonicalized once the info is
- * available.
+ * Note: the PlannerInfo node also includes group_pathkeys, distinct_pathkeys,
+ * and sort_pathkeys, which like query_pathkeys need to be canonicalized once
+ * the info is available.
  *
  * tuple_fraction is interpreted as follows:
  *       0: expect all tuples to be retrieved (normal case)
@@ -120,6 +120,8 @@ query_planner(PlannerInfo *root, List *tlist,
                                                                                                         root->query_pathkeys);
                root->group_pathkeys = canonicalize_pathkeys(root,
                                                                                                         root->group_pathkeys);
+               root->distinct_pathkeys = canonicalize_pathkeys(root,
+                                                                                                       root->distinct_pathkeys);
                root->sort_pathkeys = canonicalize_pathkeys(root,
                                                                                                        root->sort_pathkeys);
                return;
@@ -237,10 +239,12 @@ query_planner(PlannerInfo *root, List *tlist,
        /*
         * We have completed merging equivalence sets, so it's now possible to
         * convert the requested query_pathkeys to canonical form.      Also
-        * canonicalize the groupClause and sortClause pathkeys for use later.
+        * canonicalize the groupClause, distinctClause and sortClause pathkeys
+        * for use later.
         */
        root->query_pathkeys = canonicalize_pathkeys(root, root->query_pathkeys);
        root->group_pathkeys = canonicalize_pathkeys(root, root->group_pathkeys);
+       root->distinct_pathkeys = canonicalize_pathkeys(root, root->distinct_pathkeys);
        root->sort_pathkeys = canonicalize_pathkeys(root, root->sort_pathkeys);
 
        /*
@@ -286,9 +290,11 @@ query_planner(PlannerInfo *root, List *tlist,
                /*
                 * If both GROUP BY and ORDER BY are specified, we will need two
                 * levels of sort --- and, therefore, certainly need to read all the
-                * tuples --- unless ORDER BY is a subset of GROUP BY.
+                * tuples --- unless ORDER BY is a subset of GROUP BY.  Likewise if
+                * we have both DISTINCT and GROUP BY.
                 */
-               if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
+               if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys) ||
+                       !pathkeys_contained_in(root->distinct_pathkeys, root->group_pathkeys))
                        tuple_fraction = 0.0;
        }
        else if (parse->hasAggs || root->hasHavingQual)
index 40a9bffaf37e3db17aed317e80d69153fc207497..d8c6942e250aa49d426f51cae270fe38ee40859e 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.237 2008/08/03 19:10:52 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.238 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -69,12 +69,17 @@ static double preprocess_limit(PlannerInfo *root,
                                 int64 *offset_est, int64 *count_est);
 static void preprocess_groupclause(PlannerInfo *root);
 static Oid *extract_grouping_ops(List *groupClause);
+static AttrNumber *extract_grouping_cols(List *groupClause, List *tlist);
 static bool grouping_is_sortable(List *groupClause);
 static bool grouping_is_hashable(List *groupClause);
 static bool choose_hashed_grouping(PlannerInfo *root,
                                           double tuple_fraction, double limit_tuples,
                                           Path *cheapest_path, Path *sorted_path,
                                           double dNumGroups, AggClauseCounts *agg_counts);
+static bool choose_hashed_distinct(PlannerInfo *root,
+                                          Plan *input_plan, List *input_pathkeys,
+                                          double tuple_fraction, double limit_tuples,
+                                          double dNumDistinctRows);
 static List *make_subplanTargetList(PlannerInfo *root, List *tlist,
                                           AttrNumber **groupColIdx, bool *need_tlist_eval);
 static void locate_grouping_columns(PlannerInfo *root,
@@ -757,7 +762,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
        double          limit_tuples = -1.0;
        Plan       *result_plan;
        List       *current_pathkeys;
-       List       *sort_pathkeys;
        double          dNumGroups = 0;
 
        /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */
@@ -829,16 +833,15 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                 * Calculate pathkeys that represent result ordering requirements
                 */
                Assert(parse->distinctClause == NIL);
-               sort_pathkeys = make_pathkeys_for_sortclauses(root,
-                                                                                                         parse->sortClause,
-                                                                                                         tlist,
-                                                                                                         true);
+               root->sort_pathkeys = make_pathkeys_for_sortclauses(root,
+                                                                                                                       parse->sortClause,
+                                                                                                                       tlist,
+                                                                                                                       true);
        }
        else
        {
                /* No set operations, do regular planning */
                List       *sub_tlist;
-               List       *group_pathkeys;
                AttrNumber *groupColIdx = NULL;
                bool            need_tlist_eval = true;
                QualCost        tlist_cost;
@@ -870,14 +873,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                /*
                 * Calculate pathkeys that represent grouping/ordering requirements.
                 * Stash them in PlannerInfo so that query_planner can canonicalize
-                * them after EquivalenceClasses have been formed.
-                *
-                * Note: for the moment, DISTINCT is always implemented via sort/uniq,
-                * and we set the sort_pathkeys to be the more rigorous of the
-                * DISTINCT and ORDER BY requirements.  This should be changed
-                * someday, but DISTINCT ON is a bit of a problem ...
+                * them after EquivalenceClasses have been formed.  The sortClause
+                * is certainly sort-able, but GROUP BY and DISTINCT might not be,
+                * in which case we just leave their pathkeys empty.
                 */
-               if (parse->groupClause && grouping_is_sortable(parse->groupClause))
+               if (parse->groupClause &&
+                       grouping_is_sortable(parse->groupClause))
                        root->group_pathkeys =
                                make_pathkeys_for_sortclauses(root,
                                                                                          parse->groupClause,
@@ -886,18 +887,21 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                else
                        root->group_pathkeys = NIL;
 
-               if (list_length(parse->distinctClause) > list_length(parse->sortClause))
-                       root->sort_pathkeys =
+               if (parse->distinctClause &&
+                       grouping_is_sortable(parse->distinctClause))
+                       root->distinct_pathkeys =
                                make_pathkeys_for_sortclauses(root,
                                                                                          parse->distinctClause,
                                                                                          tlist,
                                                                                          false);
                else
-                       root->sort_pathkeys =
-                               make_pathkeys_for_sortclauses(root,
-                                                                                         parse->sortClause,
-                                                                                         tlist,
-                                                                                         false);
+                       root->distinct_pathkeys = NIL;
+
+               root->sort_pathkeys =
+                       make_pathkeys_for_sortclauses(root,
+                                                                                 parse->sortClause,
+                                                                                 tlist,
+                                                                                 false);
 
                /*
                 * Will need actual number of aggregates for estimating costs.
@@ -917,17 +921,27 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                }
 
                /*
-                * Figure out whether we need a sorted result from query_planner.
+                * Figure out whether we want a sorted result from query_planner.
                 *
                 * If we have a sortable GROUP BY clause, then we want a result sorted
-                * properly for grouping.  Otherwise, if there is an ORDER BY clause,
-                * we want to sort by the ORDER BY clause. (Note: if we have both, and
-                * ORDER BY is a superset of GROUP BY, it would be tempting to request
-                * sort by ORDER BY --- but that might just leave us failing to
-                * exploit an available sort order at all. Needs more thought...)
+                * properly for grouping.  Otherwise, if there's a sortable DISTINCT
+                * clause that's more rigorous than the ORDER BY clause, we try to
+                * produce output that's sufficiently well sorted for the DISTINCT.
+                * Otherwise, if there is an ORDER BY clause, we want to sort by the
+                * ORDER BY clause.
+                *
+                * Note: if we have both ORDER BY and GROUP BY, and ORDER BY is a
+                * superset of GROUP BY, it would be tempting to request sort by ORDER
+                * BY --- but that might just leave us failing to exploit an available
+                * sort order at all.  Needs more thought.  The choice for DISTINCT
+                * versus ORDER BY is much easier, since we know that the parser
+                * ensured that one is a superset of the other.
                 */
                if (root->group_pathkeys)
                        root->query_pathkeys = root->group_pathkeys;
+               else if (list_length(root->distinct_pathkeys) >
+                                list_length(root->sort_pathkeys))
+                       root->query_pathkeys = root->distinct_pathkeys;
                else if (root->sort_pathkeys)
                        root->query_pathkeys = root->sort_pathkeys;
                else
@@ -942,9 +956,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                query_planner(root, sub_tlist, tuple_fraction, limit_tuples,
                                          &cheapest_path, &sorted_path, &dNumGroups);
 
-               group_pathkeys = root->group_pathkeys;
-               sort_pathkeys = root->sort_pathkeys;
-
                /*
                 * If grouping, decide whether to use sorted or hashed grouping.
                 */
@@ -1024,7 +1035,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 
                        /* Detect if we'll need an explicit sort for grouping */
                        if (parse->groupClause && !use_hashed_grouping &&
-                               !pathkeys_contained_in(group_pathkeys, current_pathkeys))
+                               !pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
                        {
                                need_sort_for_grouping = true;
                                /*
@@ -1135,7 +1146,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                                                                                                         parse->groupClause,
                                                                                                         groupColIdx,
                                                                                                         result_plan);
-                                               current_pathkeys = group_pathkeys;
+                                               current_pathkeys = root->group_pathkeys;
                                        }
                                        aggstrategy = AGG_SORTED;
 
@@ -1178,7 +1189,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                                                                                                 parse->groupClause,
                                                                                                 groupColIdx,
                                                                                                 result_plan);
-                                       current_pathkeys = group_pathkeys;
+                                       current_pathkeys = root->group_pathkeys;
                                }
 
                                result_plan = (Plan *) make_group(root,
@@ -1214,35 +1225,129 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
        }                                                       /* end of if (setOperations) */
 
        /*
-        * If we were not able to make the plan come out in the right order, add
-        * an explicit sort step.
+        * If there is a DISTINCT clause, add the necessary node(s).
         */
-       if (sort_pathkeys)
+       if (parse->distinctClause)
        {
-               if (!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
+               double  dNumDistinctRows;
+               long    numDistinctRows;
+               bool    use_hashed_distinct;
+               bool    can_sort;
+               bool    can_hash;
+
+               /*
+                * If there was grouping or aggregation, use the current number of
+                * rows as the estimated number of DISTINCT rows (ie, assume the
+                * result was already mostly unique).  If not, use the number of
+                * distinct-groups calculated by query_planner.
+                */
+               if (parse->groupClause || root->hasHavingQual || parse->hasAggs)
+                       dNumDistinctRows = result_plan->plan_rows;
+               else
+                       dNumDistinctRows = dNumGroups;
+
+               /* Also convert to long int --- but 'ware overflow! */
+               numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX);
+
+               /*
+                * If we have a sortable DISTINCT ON clause, we always use sorting.
+                * This enforces the expected behavior of DISTINCT ON.
+                */
+               can_sort = grouping_is_sortable(parse->distinctClause);
+               if (can_sort && parse->hasDistinctOn)
+                       use_hashed_distinct = false;
+               else
                {
-                       result_plan = (Plan *) make_sort_from_pathkeys(root,
-                                                                                                                  result_plan,
-                                                                                                                  sort_pathkeys,
-                                                                                                                  limit_tuples);
-                       current_pathkeys = sort_pathkeys;
+                       can_hash = grouping_is_hashable(parse->distinctClause);
+                       if (can_hash && can_sort)
+                       {
+                               /* we have a meaningful choice to make ... */
+                               use_hashed_distinct =
+                                       choose_hashed_distinct(root,
+                                                                                  result_plan, current_pathkeys,
+                                                                                  tuple_fraction, limit_tuples,
+                                                                                  dNumDistinctRows);
+                       }
+                       else if (can_hash)
+                               use_hashed_distinct = true;
+                       else if (can_sort)
+                               use_hashed_distinct = false;
+                       else
+                       {
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                errmsg("could not implement DISTINCT"),
+                                                errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+                               use_hashed_distinct = false; /* keep compiler quiet */
+                       }
+               }
+
+               if (use_hashed_distinct)
+               {
+                       /* Hashed aggregate plan --- no sort needed */
+                       result_plan = (Plan *) make_agg(root,
+                                                                                       result_plan->targetlist,
+                                                                                       NIL,
+                                                                                       AGG_HASHED,
+                                                                                       list_length(parse->distinctClause),
+                                                                                       extract_grouping_cols(parse->distinctClause,
+                                                                                                                                 result_plan->targetlist),
+                                                                                       extract_grouping_ops(parse->distinctClause),
+                                                                                       numDistinctRows,
+                                                                                       0,
+                                                                                       result_plan);
+                       /* Hashed aggregation produces randomly-ordered results */
+                       current_pathkeys = NIL;
+               }
+               else
+               {
+                       /*
+                        * Use a Unique node to implement DISTINCT.  Add an explicit sort
+                        * if we couldn't make the path come out the way the Unique node
+                        * needs it.  If we do have to sort, sort by the more rigorous
+                        * of DISTINCT and ORDER BY, to avoid a second sort below.
+                        */
+                       if (!pathkeys_contained_in(root->distinct_pathkeys,
+                                                                          current_pathkeys))
+                       {
+                               if (list_length(root->distinct_pathkeys) >=
+                                       list_length(root->sort_pathkeys))
+                                       current_pathkeys = root->distinct_pathkeys;
+                               else
+                               {
+                                       current_pathkeys = root->sort_pathkeys;
+                                       /* Assert checks that parser didn't mess up... */
+                                       Assert(pathkeys_contained_in(root->distinct_pathkeys,
+                                                                                                current_pathkeys));
+                               }
+
+                               result_plan = (Plan *) make_sort_from_pathkeys(root,
+                                                                                                                          result_plan,
+                                                                                                                          current_pathkeys,
+                                                                                                                          -1.0);
+                       }
+
+                       result_plan = (Plan *) make_unique(result_plan,
+                                                                                          parse->distinctClause);
+                       result_plan->plan_rows = dNumDistinctRows;
+                       /* The Unique node won't change sort ordering */
                }
        }
 
        /*
-        * If there is a DISTINCT clause, add the UNIQUE node.
+        * If ORDER BY was given and we were not able to make the plan come out in
+        * the right order, add an explicit sort step.
         */
-       if (parse->distinctClause)
+       if (parse->sortClause)
        {
-               result_plan = (Plan *) make_unique(result_plan, parse->distinctClause);
-
-               /*
-                * If there was grouping or aggregation, leave plan_rows as-is (ie,
-                * assume the result was already mostly unique).  If not, use the
-                * number of distinct-groups calculated by query_planner.
-                */
-               if (!parse->groupClause && !root->hasHavingQual && !parse->hasAggs)
-                       result_plan->plan_rows = dNumGroups;
+               if (!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
+               {
+                       result_plan = (Plan *) make_sort_from_pathkeys(root,
+                                                                                                                  result_plan,
+                                                                                                                  root->sort_pathkeys,
+                                                                                                                  limit_tuples);
+                       current_pathkeys = root->sort_pathkeys;
+               }
        }
 
        /*
@@ -1622,6 +1727,31 @@ extract_grouping_ops(List *groupClause)
        return groupOperators;
 }
 
+/*
+ * extract_grouping_cols - make a palloc'd array of the grouping column
+ *             resnos, one per SortGroupClause in list order, looked up in tlist
+ */
+static AttrNumber *
+extract_grouping_cols(List *groupClause, List *tlist)
+{
+       AttrNumber *grpColIdx;
+       int                     numCols = list_length(groupClause);
+       int                     colno = 0;
+       ListCell   *glitem;
+
+       grpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols);	/* one slot per clause */
+
+       foreach(glitem, groupClause)
+       {
+               SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem);
+               TargetEntry *tle = get_sortgroupclause_tle(groupcl, tlist);	/* tlist entry for this clause */
+
+               grpColIdx[colno++] = tle->resno;	/* store its resno at the clause's position */
+       }
+
+       return grpColIdx;
+}
+
 /*
  * grouping_is_sortable - is it possible to implement grouping list by sorting?
  *
@@ -1680,6 +1810,7 @@ choose_hashed_grouping(PlannerInfo *root,
        double          cheapest_path_rows;
        int                     cheapest_path_width;
        Size            hashentrysize;
+       List       *target_pathkeys;
        List       *current_pathkeys;
        Path            hashed_p;
        Path            sorted_p;
@@ -1716,6 +1847,20 @@ choose_hashed_grouping(PlannerInfo *root,
        if (hashentrysize * dNumGroups > work_mem * 1024L)
                return false;
 
+       /*
+        * When we have both GROUP BY and DISTINCT, use the more-rigorous of
+        * DISTINCT and ORDER BY as the assumed required output sort order.
+        * This is an oversimplification because the DISTINCT might get
+        * implemented via hashing, but it's not clear that the case is common
+        * enough (or that our estimates are good enough) to justify trying to
+        * solve it exactly.
+        */
+       if (list_length(root->distinct_pathkeys) >
+               list_length(root->sort_pathkeys))
+               target_pathkeys = root->distinct_pathkeys;
+       else
+               target_pathkeys = root->sort_pathkeys;
+
        /*
         * See if the estimated cost is no more than doing it the other way. While
         * avoiding the need for sorted input is usually a win, the fact that the
@@ -1737,8 +1882,8 @@ choose_hashed_grouping(PlannerInfo *root,
                         cheapest_path->startup_cost, cheapest_path->total_cost,
                         cheapest_path_rows);
        /* Result of hashed agg is always unsorted */
-       if (root->sort_pathkeys)
-               cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
+       if (target_pathkeys)
+               cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost,
                                  dNumGroups, cheapest_path_width, limit_tuples);
 
        if (sorted_path)
@@ -1770,9 +1915,9 @@ choose_hashed_grouping(PlannerInfo *root,
                                   sorted_p.startup_cost, sorted_p.total_cost,
                                   cheapest_path_rows);
        /* The Agg or Group node will preserve ordering */
-       if (root->sort_pathkeys &&
-               !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
-               cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
+       if (target_pathkeys &&
+               !pathkeys_contained_in(target_pathkeys, current_pathkeys))
+               cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost,
                                  dNumGroups, cheapest_path_width, limit_tuples);
 
        /*
@@ -1791,6 +1936,111 @@ choose_hashed_grouping(PlannerInfo *root,
        return false;
 }
 
+/*
+ * choose_hashed_distinct - should we use hashing for DISTINCT?
+ *
+ * Returns true if hashed aggregation is estimated to be cheaper than the
+ * sort-and-group approach; false otherwise, including when enable_hashagg
+ * is off or the hash table is not expected to fit in work_mem.
+ *
+ * This is fairly similar to choose_hashed_grouping, but there are enough
+ * differences that it doesn't seem worth trying to unify the two functions.
+ * Making the two choices independently is a bit bogus in itself; a single
+ * combined choice would probably be better, but that seems far too unwieldy
+ * to be practical, given that GROUP BY plus DISTINCT isn't very common in
+ * real queries.  Separating them gives extra preference to sorting when a
+ * common sort key is available ... and that's not necessarily wrong anyway.
+ * Note: this is only applied when both alternatives are actually feasible.
+ */
+static bool
+choose_hashed_distinct(PlannerInfo *root,
+                                          Plan *input_plan, List *input_pathkeys,
+                                          double tuple_fraction, double limit_tuples,
+                                          double dNumDistinctRows)
+{
+       int                     numDistinctCols = list_length(root->parse->distinctClause);
+       Size            hashentrysize; /* estimated size of one hashtable entry */
+       List       *current_pathkeys; /* ordering tracked along the sorted path */
+       Path            hashed_p; /* dummy: cost fields for the hashed approach */
+       Path            sorted_p; /* dummy: cost fields for the sorted approach */
+
+       /* Prefer sorting when enable_hashagg is off */
+       if (!enable_hashagg)
+               return false;
+
+       /*
+        * Don't do it if it doesn't look like the hashtable will fit into
+        * work_mem (entry size times dNumDistinctRows vs. work_mem * 1024).
+        */
+       hashentrysize = MAXALIGN(input_plan->plan_width) + MAXALIGN(sizeof(MinimalTupleData));
+
+       if (hashentrysize * dNumDistinctRows > work_mem * 1024L)
+               return false;
+
+       /*
+        * See if the estimated cost is no more than doing it the other way. While
+        * avoiding the need for sorted input is usually a win, the fact that the
+        * output won't be sorted may be a loss; so we need to do an actual cost
+        * comparison.
+        *
+        * We need to consider input_plan + hashagg [+ final sort] versus
+        * input_plan [+ sort] + group [+ final sort] where brackets indicate
+        * a step that may not be needed.
+        *
+        * These path variables are dummies that just hold cost fields; we don't
+        * make actual Paths for these steps.
+        */
+       cost_agg(&hashed_p, root, AGG_HASHED, 0,
+                        numDistinctCols, dNumDistinctRows,
+                        input_plan->startup_cost, input_plan->total_cost,
+                        input_plan->plan_rows);
+       /*
+        * Result of hashed agg is always unsorted, so if ORDER BY is present
+        * we need to charge for the final sort.
+        */
+       if (root->parse->sortClause)
+               cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
+                                 dNumDistinctRows, input_plan->plan_width, limit_tuples);
+
+       /* Now for the GROUP case, starting from the input plan's costs ... */
+       sorted_p.startup_cost = input_plan->startup_cost;
+       sorted_p.total_cost = input_plan->total_cost;
+       current_pathkeys = input_pathkeys;
+       if (!pathkeys_contained_in(root->distinct_pathkeys, current_pathkeys))
+       {
+               /* We don't want to sort twice, so use the longer pathkey list */
+               if (list_length(root->distinct_pathkeys) >=
+                       list_length(root->sort_pathkeys))
+                       current_pathkeys = root->distinct_pathkeys;
+               else
+                       current_pathkeys = root->sort_pathkeys;
+               cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost,
+                                 input_plan->plan_rows, input_plan->plan_width, -1.0);
+       }
+       cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows,
+                          sorted_p.startup_cost, sorted_p.total_cost,
+                          input_plan->plan_rows);
+       if (root->parse->sortClause &&
+               !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
+               cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
+                                 dNumDistinctRows, input_plan->plan_width, limit_tuples);
+
+       /*
+        * Now make the decision using the top-level tuple fraction.  A value
+        * >= 1.0 is an absolute count (LIMIT); convert it to fractional form.
+        */
+       if (tuple_fraction >= 1.0)
+               tuple_fraction /= dNumDistinctRows;
+
+       if (compare_fractional_path_costs(&hashed_p, &sorted_p,
+                                                                         tuple_fraction) < 0)
+       {
+               /* Hashed is cheaper, so use it */
+               return true;
+       }
+       return false;
+}
+
 /*---------------
  * make_subplanTargetList
  *       Generate appropriate target list when grouping is required.
@@ -1857,7 +2107,7 @@ make_subplanTargetList(PlannerInfo *root,
 
        /*
         * Otherwise, start with a "flattened" tlist (having just the vars
-        * mentioned in the targetlist and HAVING qual --- but not upper- level
+        * mentioned in the targetlist and HAVING qual --- but not upper-level
         * Vars; they will be replaced by Params later on).
         */
        sub_tlist = flatten_tlist(tlist);
@@ -1886,16 +2136,28 @@ make_subplanTargetList(PlannerInfo *root,
                        SortGroupClause *grpcl = (SortGroupClause *) lfirst(gl);
                        Node       *groupexpr = get_sortgroupclause_expr(grpcl, tlist);
                        TargetEntry *te = NULL;
-                       ListCell   *sl;
 
-                       /* Find or make a matching sub_tlist entry */
-                       foreach(sl, sub_tlist)
+                       /*
+                        * Find or make a matching sub_tlist entry.  If the groupexpr
+                        * isn't a Var, no point in searching.  (Note that the parser
+                        * won't make multiple groupClause entries for the same TLE.)
+                        */
+                       if (groupexpr && IsA(groupexpr, Var))
                        {
-                               te = (TargetEntry *) lfirst(sl);
-                               if (equal(groupexpr, te->expr))
-                                       break;
+                               ListCell   *sl;
+
+                               foreach(sl, sub_tlist)
+                               {
+                                       TargetEntry *lte = (TargetEntry *) lfirst(sl);
+
+                                       if (equal(groupexpr, lte->expr))
+                                       {
+                                               te = lte;
+                                               break;
+                                       }
+                               }
                        }
-                       if (!sl)
+                       if (!te)
                        {
                                te = makeTargetEntry((Expr *) groupexpr,
                                                                         list_length(sub_tlist) + 1,
index 76e59c82d6edee77066ce70064769b29999ae7bd..2b04ee5e33783301dc2450cd9b22c8798e01520e 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.173 2008/08/03 19:10:52 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.174 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1447,9 +1447,6 @@ transformDistinctClause(ParseState *pstate,
        /*
         * Now add any remaining non-resjunk tlist items, using default
         * sort/group semantics for their data types.
-        *
-        * XXX for now, the planner requires distinctClause to be sortable,
-        * so we have to insist on that here.
         */
        foreach(tlitem, *targetlist)
        {
@@ -1459,8 +1456,7 @@ transformDistinctClause(ParseState *pstate,
                        continue;                       /* ignore junk */
                result = addTargetToGroupList(pstate, tle,
                                                                          result, *targetlist,
-                                                                         true, /* XXX for now */
-                                                                         true);
+                                                                         false, true);
        }
 
        return result;
@@ -1555,8 +1551,7 @@ transformDistinctOnClause(ParseState *pstate, List *distinctlist,
                                         errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
                result = addTargetToGroupList(pstate, tle,
                                                                          result, *targetlist,
-                                                                         true, /* someday allow hash-only? */
-                                                                         true);
+                                                                         false, true);
        }
 
        return result;
index 8476d7e85c141bc0a06273cd3a5ccbad883f52d5..f8c23071661b50086aa26287915fc5082354a1ae 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.156 2008/04/21 20:54:15 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.157 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -162,8 +162,9 @@ typedef struct PlannerInfo
        List       *query_pathkeys; /* desired pathkeys for query_planner(), and
                                                                 * actual pathkeys afterwards */
 
-       List       *group_pathkeys; /* groupClause pathkeys, if any */
-       List       *sort_pathkeys;      /* sortClause pathkeys, if any */
+       List       *group_pathkeys;             /* groupClause pathkeys, if any */
+       List       *distinct_pathkeys;  /* distinctClause pathkeys, if any */
+       List       *sort_pathkeys;              /* sortClause pathkeys, if any */
 
        List       *initial_rels;       /* RelOptInfos we are now trying to join */
 
index c5ad36fdd32df016b14ccd5ee49823117619660d..d404d9db68121049455a14ab3a97abfbed467187 100644 (file)
@@ -79,7 +79,7 @@ INSERT INTO TEMP_GROUP
 INSERT INTO TEMP_GROUP
   SELECT 2, i.f1, f.f1
   FROM INT4_TBL i, FLOAT8_TBL f;
-SELECT DISTINCT f1 AS two FROM TEMP_GROUP;
+SELECT DISTINCT f1 AS two FROM TEMP_GROUP ORDER BY 1;
  two 
 -----
    1
index 69f0efb8d42206d2a829cbba1e9eed96b6c15add..533bac3ab6705bc6ff5be990b68a3e84f7e4cf11 100644 (file)
@@ -129,7 +129,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.prorettype < p2.prorettype);
+    (p1.prorettype < p2.prorettype)
+ORDER BY 1, 2;
  prorettype | prorettype 
 ------------+------------
          25 |       1043
@@ -142,7 +143,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[0] < p2.proargtypes[0]);
+    (p1.proargtypes[0] < p2.proargtypes[0])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
           25 |        1042
@@ -158,7 +160,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[1] < p2.proargtypes[1]);
+    (p1.proargtypes[1] < p2.proargtypes[1])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
           23 |          28
@@ -173,7 +176,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[2] < p2.proargtypes[2]);
+    (p1.proargtypes[2] < p2.proargtypes[2])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
         1114 |        1184
@@ -185,7 +189,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[3] < p2.proargtypes[3]);
+    (p1.proargtypes[3] < p2.proargtypes[3])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
         1114 |        1184
@@ -197,7 +202,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[4] < p2.proargtypes[4]);
+    (p1.proargtypes[4] < p2.proargtypes[4])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
@@ -208,7 +214,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[5] < p2.proargtypes[5]);
+    (p1.proargtypes[5] < p2.proargtypes[5])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
@@ -219,7 +226,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[6] < p2.proargtypes[6]);
+    (p1.proargtypes[6] < p2.proargtypes[6])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
@@ -230,7 +238,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[7] < p2.proargtypes[7]);
+    (p1.proargtypes[7] < p2.proargtypes[7])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
index abe34ae7ae5966a54d21f216cfdade115667ae43..fe64fe0d9c175909a93ce388b3ffd121a5d46628 100644 (file)
@@ -14,7 +14,7 @@ SELECT DISTINCT two FROM tmp;
 --
 -- awk '{print $5;}' onek.data | sort -n | uniq
 --
-SELECT DISTINCT ten FROM tmp;
+SELECT DISTINCT ten FROM tmp ORDER BY 1;
  ten 
 -----
    0
@@ -32,7 +32,7 @@ SELECT DISTINCT ten FROM tmp;
 --
 -- awk '{print $16;}' onek.data | sort -d | uniq
 --
-SELECT DISTINCT string4 FROM tmp;
+SELECT DISTINCT string4 FROM tmp ORDER BY 1;
  string4 
 ---------
  AAAAxx
index c82b1405978628f002254f10abb0cba52f83fef2..c5813630625ec965e79d482626157d2942fc6fc6 100644 (file)
@@ -183,7 +183,8 @@ SELECT p.name, name(p.hobbies) FROM person* p;
 -- the next two queries demonstrate how functions generate bogus duplicates.
 -- this is a "feature" ..
 --
-SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r;
+SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r
+  ORDER BY 1,2;
 
 SELECT hobbies_r.name, (hobbies_r.equipment).name FROM hobbies_r;
 
index e409c0a10019c2c99d1f91ba9a853e482616cafa..91e0d2b04d79dab77c3dab10c274e7a496c856f1 100644 (file)
@@ -469,7 +469,8 @@ SELECT p.name, name(p.hobbies) FROM person* p;
 -- the next two queries demonstrate how functions generate bogus duplicates.
 -- this is a "feature" ..
 --
-SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r;
+SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r
+  ORDER BY 1,2;
     name     |     name      
 -------------+---------------
  basketball  | hightops
index 2220fdba38587ea1599124ac01b3a2cc8ddf0dc1..6626cf20ebc5d10a38b093b98bab812e2b9b4972 100644 (file)
@@ -63,7 +63,7 @@ INSERT INTO TEMP_GROUP
   SELECT 2, i.f1, f.f1
   FROM INT4_TBL i, FLOAT8_TBL f;
 
-SELECT DISTINCT f1 AS two FROM TEMP_GROUP;
+SELECT DISTINCT f1 AS two FROM TEMP_GROUP ORDER BY 1;
 
 SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
   FROM TEMP_GROUP
index e2ab6e57d4a36790b8b5b73db41f7e5ac2a85206..5017849830aa612d14a1359eb8c2cd3f9e622f7f 100644 (file)
@@ -121,7 +121,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.prorettype < p2.prorettype);
+    (p1.prorettype < p2.prorettype)
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[0], p2.proargtypes[0]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -129,7 +130,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[0] < p2.proargtypes[0]);
+    (p1.proargtypes[0] < p2.proargtypes[0])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[1], p2.proargtypes[1]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -137,7 +139,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[1] < p2.proargtypes[1]);
+    (p1.proargtypes[1] < p2.proargtypes[1])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[2], p2.proargtypes[2]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -145,7 +148,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[2] < p2.proargtypes[2]);
+    (p1.proargtypes[2] < p2.proargtypes[2])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[3], p2.proargtypes[3]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -153,7 +157,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[3] < p2.proargtypes[3]);
+    (p1.proargtypes[3] < p2.proargtypes[3])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[4], p2.proargtypes[4]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -161,7 +166,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[4] < p2.proargtypes[4]);
+    (p1.proargtypes[4] < p2.proargtypes[4])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[5], p2.proargtypes[5]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -169,7 +175,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[5] < p2.proargtypes[5]);
+    (p1.proargtypes[5] < p2.proargtypes[5])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[6], p2.proargtypes[6]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -177,7 +184,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[6] < p2.proargtypes[6]);
+    (p1.proargtypes[6] < p2.proargtypes[6])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[7], p2.proargtypes[7]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -185,7 +193,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[7] < p2.proargtypes[7]);
+    (p1.proargtypes[7] < p2.proargtypes[7])
+ORDER BY 1, 2;
 
 -- Look for functions that return type "internal" and do not have any
 -- "internal" argument.  Such a function would be a security hole since
index c4a63aaf16fd9788b10523cc4d173bf31520bf65..7416e0194e172090810e7ba70fb4cfc00b618220 100644 (file)
@@ -10,12 +10,12 @@ SELECT DISTINCT two FROM tmp;
 --
 -- awk '{print $5;}' onek.data | sort -n | uniq
 --
-SELECT DISTINCT ten FROM tmp;
+SELECT DISTINCT ten FROM tmp ORDER BY 1;
 
 --
 -- awk '{print $16;}' onek.data | sort -d | uniq
 --
-SELECT DISTINCT string4 FROM tmp;
+SELECT DISTINCT string4 FROM tmp ORDER BY 1;
 
 --
 -- awk '{print $3,$16,$5;}' onek.data | sort -d | uniq |