From c78248c91d5147a45907cb05d2c424cf4a3a792d Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 5 Aug 2008 16:03:10 +0000
Subject: [PATCH] Department of second thoughts: fix newly-added code in
 planner.c to make real sure that DISTINCT ON does what it's supposed to, ie,
 sort by the full ORDER BY list before unique-ifying.  The error seems masked
 in simple cases by the fact that query_planner won't return query pathkeys
 that only partially match the requested sort order, but I wouldn't want to
 bet that it couldn't be exposed in some way or other.

---
 src/backend/optimizer/plan/planner.c | 38 ++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index d8c6942e25..5b7fde2533 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.238 2008/08/05 02:43:17 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.239 2008/08/05 16:03:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1304,11 +1304,24 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 			/*
 			 * Use a Unique node to implement DISTINCT.  Add an explicit sort
 			 * if we couldn't make the path come out the way the Unique node
-			 * needs it.  If we do have to sort, sort by the more rigorous
-			 * of DISTINCT and ORDER BY, to avoid a second sort below.
+			 * needs it.  If we do have to sort, always sort by the more
+			 * rigorous of DISTINCT and ORDER BY, to avoid a second sort
+			 * below.  However, for regular DISTINCT, don't sort now if we
+			 * don't have to --- sorting afterwards will likely be cheaper,
+			 * and also has the possibility of optimizing via LIMIT.  But
+			 * for DISTINCT ON, we *must* force the final sort now, else
+			 * it won't have the desired behavior.
 			 */
-			if (!pathkeys_contained_in(root->distinct_pathkeys,
-									   current_pathkeys))
+			List   *needed_pathkeys;
+
+			if (parse->hasDistinctOn &&
+				list_length(root->distinct_pathkeys) <
+				list_length(root->sort_pathkeys))
+				needed_pathkeys = root->sort_pathkeys;
+			else
+				needed_pathkeys = root->distinct_pathkeys;
+
+			if (!pathkeys_contained_in(needed_pathkeys, current_pathkeys))
 			{
 				if (list_length(root->distinct_pathkeys) >=
 					list_length(root->sort_pathkeys))
@@ -1961,6 +1974,7 @@ choose_hashed_distinct(PlannerInfo *root,
 	int			numDistinctCols = list_length(root->parse->distinctClause);
 	Size		hashentrysize;
 	List	   *current_pathkeys;
+	List	   *needed_pathkeys;
 	Path		hashed_p;
 	Path		sorted_p;
 
@@ -2002,13 +2016,21 @@ choose_hashed_distinct(PlannerInfo *root,
 		cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
 				  dNumDistinctRows, input_plan->plan_width, limit_tuples);
 
-	/* Now for the GROUP case ... */
+	/*
+	 * Now for the GROUP case.  See comments in grouping_planner about the
+	 * sorting choices here --- this code should match that code.
+	 */
 	sorted_p.startup_cost = input_plan->startup_cost;
 	sorted_p.total_cost = input_plan->total_cost;
 	current_pathkeys = input_pathkeys;
-	if (!pathkeys_contained_in(root->distinct_pathkeys, current_pathkeys))
+	if (root->parse->hasDistinctOn &&
+		list_length(root->distinct_pathkeys) <
+		list_length(root->sort_pathkeys))
+		needed_pathkeys = root->sort_pathkeys;
+	else
+		needed_pathkeys = root->distinct_pathkeys;
+	if (!pathkeys_contained_in(needed_pathkeys, current_pathkeys))
 	{
-		/* We don't want to sort twice */
 		if (list_length(root->distinct_pathkeys) >=
 			list_length(root->sort_pathkeys))
 			current_pathkeys = root->distinct_pathkeys;
-- 
2.50.0