From 76b6ee3f380b90956a1ca18f15e2a70874651f66 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Feb 2010 03:38:35 +0000 Subject: [PATCH] Improve planner's choices about when to use hashing vs sorting for DISTINCT. The previous coding missed a bet by sometimes picking the "sorted" path from query_planner even though hashing would be preferable. To fix, we have to be willing to make the choice sooner. This contorts things a little bit, but I thought of a factorization that makes it not too awful. --- src/backend/optimizer/plan/planner.c | 271 ++++++++++++++++----------- 1 file changed, 160 insertions(+), 111 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 6c58442e59..3748c83fd6 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.263 2010/01/02 16:57:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.264 2010/02/10 03:38:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,11 +71,15 @@ static double preprocess_limit(PlannerInfo *root, static void preprocess_groupclause(PlannerInfo *root); static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, double limit_tuples, + double path_rows, int path_width, Path *cheapest_path, Path *sorted_path, double dNumGroups, AggClauseCounts *agg_counts); static bool choose_hashed_distinct(PlannerInfo *root, - Plan *input_plan, List *input_pathkeys, double tuple_fraction, double limit_tuples, + double path_rows, int path_width, + Cost cheapest_startup_cost, Cost cheapest_total_cost, + Cost sorted_startup_cost, Cost sorted_total_cost, + List *sorted_pathkeys, double dNumDistinctRows); static List *make_subplanTargetList(PlannerInfo *root, List *tlist, AttrNumber **groupColIdx, bool *need_tlist_eval); @@ -855,6 +859,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) Plan *result_plan; List *current_pathkeys; double dNumGroups = 0; + bool use_hashed_distinct = false; + bool tested_hashed_distinct = false; /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */ if (parse->limitCount || parse->limitOffset) @@ -945,6 +951,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) long numGroups = 0; AggClauseCounts agg_counts; int numGroupCols; + double path_rows; + int path_width; bool use_hashed_grouping = false; WindowFuncLists *wflists = NULL; List *activeWindows = NIL; @@ -1088,51 +1096,62 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) &cheapest_path, &sorted_path, &dNumGroups); /* - * If grouping, decide whether to use sorted or hashed grouping. + * Extract rowcount and width estimates for possible use in grouping + * decisions. Beware here of the possibility that + * cheapest_path->parent is NULL (ie, there is no FROM clause). */ - if (parse->groupClause) + if (cheapest_path->parent) + { + path_rows = cheapest_path->parent->rows; + path_width = cheapest_path->parent->width; + } + else { - bool can_hash; - bool can_sort; + path_rows = 1; /* assume non-set result */ + path_width = 100; /* arbitrary */ + } + if (parse->groupClause) + { /* - * Executor doesn't support hashed aggregation with DISTINCT or - * ORDER BY aggregates. (Doing so would imply storing *all* the - * input values in the hash table, and/or running many sorts in - * parallel, either of which seems like a certain loser.) + * If grouping, decide whether to use sorted or hashed grouping. */ - can_hash = (agg_counts.numOrderedAggs == 0 && - grouping_is_hashable(parse->groupClause)); - can_sort = grouping_is_sortable(parse->groupClause); - if (can_hash && can_sort) - { - /* we have a meaningful choice to make ... */ - use_hashed_grouping = - choose_hashed_grouping(root, - tuple_fraction, limit_tuples, - cheapest_path, sorted_path, - dNumGroups, &agg_counts); - } - else if (can_hash) - use_hashed_grouping = true; - else if (can_sort) - use_hashed_grouping = false; - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement GROUP BY"), - errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - + use_hashed_grouping = + choose_hashed_grouping(root, + tuple_fraction, limit_tuples, + path_rows, path_width, + cheapest_path, sorted_path, + dNumGroups, &agg_counts); /* Also convert # groups to long int --- but 'ware overflow! */ numGroups = (long) Min(dNumGroups, (double) LONG_MAX); } + else if (parse->distinctClause && sorted_path && + !root->hasHavingQual && !parse->hasAggs && !activeWindows) + { + /* + * We'll reach the DISTINCT stage without any intermediate + * processing, so figure out whether we will want to hash or not + * so we can choose whether to use cheapest or sorted path. + */ + use_hashed_distinct = + choose_hashed_distinct(root, + tuple_fraction, limit_tuples, + path_rows, path_width, + cheapest_path->startup_cost, + cheapest_path->total_cost, + sorted_path->startup_cost, + sorted_path->total_cost, + sorted_path->pathkeys, + dNumGroups); + tested_hashed_distinct = true; + } /* * Select the best path. If we are doing hashed grouping, we will * always read all the input tuples, so use the cheapest-total path. * Otherwise, trust query_planner's decision about which to use. */ - if (use_hashed_grouping || !sorted_path) + if (use_hashed_grouping || use_hashed_distinct || !sorted_path) best_path = cheapest_path; else best_path = sorted_path; @@ -1506,9 +1525,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) { double dNumDistinctRows; long numDistinctRows; - bool use_hashed_distinct; - bool can_sort; - bool can_hash; /* * If there was grouping or aggregation, use the current number of @@ -1524,37 +1540,25 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) /* Also convert to long int --- but 'ware overflow! */ numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX); - /* - * If we have a sortable DISTINCT ON clause, we always use sorting. - * This enforces the expected behavior of DISTINCT ON. - */ - can_sort = grouping_is_sortable(parse->distinctClause); - if (can_sort && parse->hasDistinctOn) - use_hashed_distinct = false; - else + /* Choose implementation method if we didn't already */ + if (!tested_hashed_distinct) { - can_hash = grouping_is_hashable(parse->distinctClause); - if (can_hash && can_sort) - { - /* we have a meaningful choice to make ... */ - use_hashed_distinct = - choose_hashed_distinct(root, - result_plan, current_pathkeys, - tuple_fraction, limit_tuples, - dNumDistinctRows); - } - else if (can_hash) - use_hashed_distinct = true; - else if (can_sort) - use_hashed_distinct = false; - else - { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement DISTINCT"), - errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - use_hashed_distinct = false; /* keep compiler quiet */ - } + /* + * At this point, either hashed or sorted grouping will have to + * work from result_plan, so we pass that as both "cheapest" and + * "sorted". + */ + use_hashed_distinct = + choose_hashed_distinct(root, + tuple_fraction, limit_tuples, + result_plan->plan_rows, + result_plan->plan_width, + result_plan->startup_cost, + result_plan->total_cost, + result_plan->startup_cost, + result_plan->total_cost, + current_pathkeys, + dNumDistinctRows); } if (use_hashed_distinct) @@ -2155,23 +2159,49 @@ preprocess_groupclause(PlannerInfo *root) /* * choose_hashed_grouping - should we use hashed grouping? * - * Note: this is only applied when both alternatives are actually feasible. + * Returns TRUE to select hashing, FALSE to select sorting. */ static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, double limit_tuples, + double path_rows, int path_width, Path *cheapest_path, Path *sorted_path, double dNumGroups, AggClauseCounts *agg_counts) { - int numGroupCols = list_length(root->parse->groupClause); - double cheapest_path_rows; - int cheapest_path_width; + Query *parse = root->parse; + int numGroupCols = list_length(parse->groupClause); + bool can_hash; + bool can_sort; Size hashentrysize; List *target_pathkeys; List *current_pathkeys; Path hashed_p; Path sorted_p; + /* + * Executor doesn't support hashed aggregation with DISTINCT or ORDER BY + * aggregates. (Doing so would imply storing *all* the input values in + * the hash table, and/or running many sorts in parallel, either of which + * seems like a certain loser.) + */ + can_hash = (agg_counts->numOrderedAggs == 0 && + grouping_is_hashable(parse->groupClause)); + can_sort = grouping_is_sortable(parse->groupClause); + + /* Quick out if only one choice is workable */ + if (!(can_hash && can_sort)) + { + if (can_hash) + return true; + else if (can_sort) + return false; + else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement GROUP BY"), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); + } + /* Prefer sorting when enable_hashagg is off */ if (!enable_hashagg) return false; @@ -2179,23 +2209,10 @@ choose_hashed_grouping(PlannerInfo *root, /* * Don't do it if it doesn't look like the hashtable will fit into * work_mem. - * - * Beware here of the possibility that cheapest_path->parent is NULL. This - * could happen if user does something silly like SELECT 'foo' GROUP BY 1; */ - if (cheapest_path->parent) - { - cheapest_path_rows = cheapest_path->parent->rows; - cheapest_path_width = cheapest_path->parent->width; - } - else - { - cheapest_path_rows = 1; /* assume non-set result */ - cheapest_path_width = 100; /* arbitrary */ - } /* Estimate per-hash-entry space at tuple width... */ - hashentrysize = MAXALIGN(cheapest_path_width) + MAXALIGN(sizeof(MinimalTupleData)); + hashentrysize = MAXALIGN(path_width) + MAXALIGN(sizeof(MinimalTupleData)); /* plus space for pass-by-ref transition values... */ hashentrysize += agg_counts->transitionSpace; /* plus the per-hash-entry overhead */ @@ -2236,11 +2253,11 @@ choose_hashed_grouping(PlannerInfo *root, cost_agg(&hashed_p, root, AGG_HASHED, agg_counts->numAggs, numGroupCols, dNumGroups, cheapest_path->startup_cost, cheapest_path->total_cost, - cheapest_path_rows); + path_rows); /* Result of hashed agg is always unsorted */ if (target_pathkeys) cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost, - dNumGroups, cheapest_path_width, limit_tuples); + dNumGroups, path_width, limit_tuples); if (sorted_path) { @@ -2257,24 +2274,24 @@ choose_hashed_grouping(PlannerInfo *root, if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost, - cheapest_path_rows, cheapest_path_width, -1.0); + path_rows, path_width, -1.0); current_pathkeys = root->group_pathkeys; } - if (root->parse->hasAggs) + if (parse->hasAggs) cost_agg(&sorted_p, root, AGG_SORTED, agg_counts->numAggs, numGroupCols, dNumGroups, sorted_p.startup_cost, sorted_p.total_cost, - cheapest_path_rows); + path_rows); else cost_group(&sorted_p, root, numGroupCols, dNumGroups, sorted_p.startup_cost, sorted_p.total_cost, - cheapest_path_rows); + path_rows); /* The Agg or Group node will preserve ordering */ if (target_pathkeys && !pathkeys_contained_in(target_pathkeys, current_pathkeys)) cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost, - dNumGroups, cheapest_path_width, limit_tuples); + dNumGroups, path_width, limit_tuples); /* * Now make the decision using the top-level tuple fraction. First we @@ -2297,6 +2314,9 @@ choose_hashed_grouping(PlannerInfo *root, * * This is fairly similar to choose_hashed_grouping, but there are enough * differences that it doesn't seem worth trying to unify the two functions. + * (One difference is that we sometimes apply this after forming a Plan, + * so the input alternatives can't be represented as Paths --- instead we + * pass in the costs as individual variables.) * * But note that making the two choices independently is a bit bogus in * itself. If the two could be combined into a single choice operation @@ -2306,21 +2326,51 @@ choose_hashed_grouping(PlannerInfo *root, * extra preference to using a sorting implementation when a common sort key * is available ... and that's not necessarily wrong anyway. * - * Note: this is only applied when both alternatives are actually feasible. + * Returns TRUE to select hashing, FALSE to select sorting. */ static bool choose_hashed_distinct(PlannerInfo *root, - Plan *input_plan, List *input_pathkeys, double tuple_fraction, double limit_tuples, + double path_rows, int path_width, + Cost cheapest_startup_cost, Cost cheapest_total_cost, + Cost sorted_startup_cost, Cost sorted_total_cost, + List *sorted_pathkeys, double dNumDistinctRows) { - int numDistinctCols = list_length(root->parse->distinctClause); + Query *parse = root->parse; + int numDistinctCols = list_length(parse->distinctClause); + bool can_sort; + bool can_hash; Size hashentrysize; List *current_pathkeys; List *needed_pathkeys; Path hashed_p; Path sorted_p; + /* + * If we have a sortable DISTINCT ON clause, we always use sorting. + * This enforces the expected behavior of DISTINCT ON. + */ + can_sort = grouping_is_sortable(parse->distinctClause); + if (can_sort && parse->hasDistinctOn) + return false; + + can_hash = grouping_is_hashable(parse->distinctClause); + + /* Quick out if only one choice is workable */ + if (!(can_hash && can_sort)) + { + if (can_hash) + return true; + else if (can_sort) + return false; + else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement DISTINCT"), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); + } + /* Prefer sorting when enable_hashagg is off */ if (!enable_hashagg) return false; @@ -2329,7 +2379,7 @@ choose_hashed_distinct(PlannerInfo *root, * Don't do it if it doesn't look like the hashtable will fit into * work_mem. */ - hashentrysize = MAXALIGN(input_plan->plan_width) + MAXALIGN(sizeof(MinimalTupleData)); + hashentrysize = MAXALIGN(path_width) + MAXALIGN(sizeof(MinimalTupleData)); if (hashentrysize * dNumDistinctRows > work_mem * 1024L) return false; @@ -2340,8 +2390,8 @@ choose_hashed_distinct(PlannerInfo *root, * output won't be sorted may be a loss; so we need to do an actual cost * comparison. * - * We need to consider input_plan + hashagg [+ final sort] versus - * input_plan [+ sort] + group [+ final sort] where brackets indicate a + * We need to consider cheapest_path + hashagg [+ final sort] versus + * sorted_path [+ sort] + group [+ final sort] where brackets indicate a * step that may not be needed. * * These path variables are dummies that just hold cost fields; we don't @@ -2349,25 +2399,25 @@ choose_hashed_distinct(PlannerInfo *root, */ cost_agg(&hashed_p, root, AGG_HASHED, 0, numDistinctCols, dNumDistinctRows, - input_plan->startup_cost, input_plan->total_cost, - input_plan->plan_rows); + cheapest_startup_cost, cheapest_total_cost, + path_rows); /* * Result of hashed agg is always unsorted, so if ORDER BY is present we * need to charge for the final sort. */ - if (root->parse->sortClause) + if (parse->sortClause) cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost, - dNumDistinctRows, input_plan->plan_width, limit_tuples); + dNumDistinctRows, path_width, limit_tuples); /* * Now for the GROUP case. See comments in grouping_planner about the * sorting choices here --- this code should match that code. */ - sorted_p.startup_cost = input_plan->startup_cost; - sorted_p.total_cost = input_plan->total_cost; - current_pathkeys = input_pathkeys; - if (root->parse->hasDistinctOn && + sorted_p.startup_cost = sorted_startup_cost; + sorted_p.total_cost = sorted_total_cost; + current_pathkeys = sorted_pathkeys; + if (parse->hasDistinctOn && list_length(root->distinct_pathkeys) < list_length(root->sort_pathkeys)) needed_pathkeys = root->sort_pathkeys; @@ -2381,15 +2431,15 @@ choose_hashed_distinct(PlannerInfo *root, else current_pathkeys = root->sort_pathkeys; cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost, - input_plan->plan_rows, input_plan->plan_width, -1.0); + path_rows, path_width, -1.0); } cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows, sorted_p.startup_cost, sorted_p.total_cost, - input_plan->plan_rows); - if (root->parse->sortClause && + path_rows); + if (parse->sortClause && !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys)) cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost, - dNumDistinctRows, input_plan->plan_width, limit_tuples); + dNumDistinctRows, path_width, limit_tuples); /* * Now make the decision using the top-level tuple fraction. First we @@ -2407,7 +2457,7 @@ choose_hashed_distinct(PlannerInfo *root, return false; } -/*--------------- +/* * make_subplanTargetList * Generate appropriate target list when grouping is required. * @@ -2446,7 +2496,6 @@ choose_hashed_distinct(PlannerInfo *root, * result tlist. * * The result is the targetlist to be passed to the subplan. - *--------------- */ static List * make_subplanTargetList(PlannerInfo *root, -- 2.40.0